
zgemv_t.S 36 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
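
/* Reader's summary (added, inferred from the code below): zgemv_t.S is
   an IA-64 kernel for the transposed complex GEMV update
   y += alpha * A^T * x, with the CONJ/XCONJ build flags selecting the
   conjugated variants. The vector x is first packed into the
   contiguous BUFFER; columns of A are then reduced in groups of
   8, 4, 2 and 1 by software-pipelined loops, and a separate path
   starting at .L100 handles an A that is not aligned on a full
   complex element. */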
#define ASSEMBLER
#include "common.h"

#define SP	r12

#define M	r32
#define N	r33
#define A	r37
#define LDA	r38
#define X	r39
#define INCX	r34
#define Y	r35
#define INCY	r36
#define BUFFER	r11

#define I	r15
#define J	r16
#define AO1	r18
#define AO2	r19
#define AO3	r20
#define AO4	r21
#define AO5	r22
#define AO6	r23
#define AO7	r24
#define AO8	r25
#define BO	r26
#define INCYM1	r28

#define RPRE1	loc0
#define RPRE2	loc1
#define RPRE3	loc2
#define RPRE4	loc3
#define RPRE5	loc4
#define RPRE6	loc5
#define RPRE7	loc6
#define RPRE8	loc7
#define AO21	loc8
#define AO41	loc9
#define AO61	loc10
#define AO81	loc11
#define CLD1	loc12
#define CLD2	loc13
#define CST1	loc14
#define CST2	loc15

#define PREB	r8
#define WPRE	r9
#define OFFSET	PREB
#define INCX3M1	WPRE
#define INCY3M1	r10
#define ARLC	r29
#define PR	r30
#define ARPFS	r31

#ifdef DOUBLE
#define RPREFETCH (16 * 2 + 8)
#else
#define RPREFETCH (16 * 2 + 16)
#endif
#define PREFETCH lfetch.nt1

#define ALPHA_R	f6
#define ALPHA_I	f7

#if !defined(CONJ) && !defined(XCONJ)
#define ADD1 FMA
#define ADD2 FMA
#define ADD3 FNMA
#define ADD4 FMA
#elif defined(CONJ) && !defined(XCONJ)
#define ADD1 FMA
#define ADD2 FMA
#define ADD3 FMA
#define ADD4 FNMA
#elif !defined(CONJ) && defined(XCONJ)
#define ADD1 FMA
#define ADD2 FNMA
#define ADD3 FMA
#define ADD4 FMA
#else
#define ADD1 FMA
#define ADD2 FNMA
#define ADD3 FNMA
#define ADD4 FNMA
#endif
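
/* ADD1..ADD4 pick FMA or FNMA for the four partial products of each
   complex multiply-accumulate, according to whether A (CONJ) and/or
   x (XCONJ) enter the product conjugated. */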
	PROLOGUE
	PROFCODE
	.prologue
	{ .mmi
	.save ar.pfs, ARPFS
	alloc ARPFS = ar.pfs, 8, 16, 0, 0
	adds r14 = 16, SP
	mov ARLC = ar.lc
	}
	{ .mmi
	adds r8 = -8 * 16, SP
	adds r9 = -7 * 16, SP
	adds SP = -8 * 16, SP
	}
	;;
	{ .mmi
	stf.spill [r8] = f16, 32
	stf.spill [r9] = f17, 32
	mov PR = pr
	}
	;;
	{ .mmi
	stf.spill [r8] = f18, 32
	stf.spill [r9] = f19, 32
	adds r15 = 152, SP
	}
	;;
	{ .mmi
	stf.spill [r8] = f20, 32
	stf.spill [r9] = f21, 32
	adds r16 = 160, SP
	}
	;;
	{ .mmi
	stf.spill [r8] = f22
	stf.spill [r9] = f23
	adds r17 = 168, SP
	}
	.body
	;;
	{ .mmf
	ld8 INCX = [r14]
	ld8 Y = [r15]
	mov ALPHA_R = f8
	}
	{ .mmf
	ld8 INCY = [r16]
	ld8 BUFFER = [r17]
	mov ALPHA_I = f9
	}
	;;
	{ .mmi
	shladd INCX = INCX, ZBASE_SHIFT, r0
	shladd LDA = LDA, ZBASE_SHIFT, r0
	mov pr.rot = 0
	}
	{ .mmi
	cmp.ge p7, p0 = 0, M
	cmp.ge p6, p0 = 0, N
	shladd INCY = INCY, ZBASE_SHIFT, r0
	}
	;;
	{ .mmi
	mov AO1 = BUFFER
	adds OFFSET = -SIZE, INCX
	shr I = M, 3
	}
	{ .mib
	adds INCYM1 = -SIZE, INCY
	shladd INCX3M1 = INCX, 1, INCX
	(p7) br.cond.dpnt .L999
	}
	;;
	{ .mmi
	shladd BO = INCX, 1, X
	adds AO2 = 4 * SIZE, BUFFER
	mov ar.ec = 5
	}
	{ .mmb
	shladd INCY3M1 = INCY, 1, INCYM1
	adds I = -1, I
	(p6) br.cond.dpnt .L999
	}
	;;
	{ .mmi
	adds INCX3M1 = -SIZE, INCX3M1
	cmp.eq p16, p0 = r0, r0
	tbit.nz p13, p0 = M, 2
	}
	{ .mib
	cmp.gt p6, p0 = 0, I
	mov ar.lc = I
	(p6) br.cond.dpnt .L05
	}
	;;
	.align 16
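/* .L01: software-pipelined copy loop (br.ctop, ar.ec = 5). Each
   iteration packs eight complex elements of the strided x into
   BUFFER, reading two interleaved streams ([X] and [BO], which
   starts at X + 2 * INCX) and writing through AO1/AO2. */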
.L01:
	(p20) STFD [AO1] = f36, SIZE
	(p20) STFD [AO2] = f56, SIZE
	(p16) LDFD f32 = [X], SIZE
	(p16) LDFD f52 = [BO], SIZE
	;;
	(p20) STFD [AO1] = f41, SIZE
	(p20) STFD [AO2] = f61, SIZE
	(p16) LDFD f37 = [X], OFFSET
	(p16) LDFD f57 = [BO], OFFSET
	;;
	(p20) STFD [AO1] = f46, SIZE
	(p20) STFD [AO2] = f66, SIZE
	(p16) LDFD f42 = [X], SIZE
	(p16) LDFD f62 = [BO], SIZE
	;;
	(p20) STFD [AO1] = f51, 5 * SIZE
	(p20) STFD [AO2] = f71, 5 * SIZE
	(p16) LDFD f47 = [X], INCX3M1
	(p16) LDFD f67 = [BO], INCX3M1
	;;
	(p20) STFD [AO1] = f76, SIZE
	(p20) STFD [AO2] = f96, SIZE
	(p16) LDFD f72 = [X], SIZE
	(p16) LDFD f92 = [BO], SIZE
	;;
	(p20) STFD [AO1] = f81, SIZE
	(p20) STFD [AO2] = f101, SIZE
	(p16) LDFD f77 = [X], OFFSET
	(p16) LDFD f97 = [BO], OFFSET
	;;
	(p20) STFD [AO1] = f86, SIZE
	(p20) STFD [AO2] = f106, SIZE
	(p16) LDFD f82 = [X], SIZE
	(p16) LDFD f102 = [BO], SIZE
	;;
	(p20) STFD [AO1] = f91, 5 * SIZE
	(p20) STFD [AO2] = f111, 5 * SIZE
	(p16) LDFD f87 = [X], INCX3M1
	(p16) LDFD f107 = [BO], INCX3M1
	br.ctop.sptk.few .L01
	;;
	.align 16
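/* .L05: copy the remaining M mod 8 elements of x (bits 2, 1 and 0 of
   M), then test the alignment of A; the unaligned case branches to
   .L100. */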
.L05:
	{ .mmi
	(p13) LDFD f32 = [X], SIZE
	(p13) LDFD f36 = [BO], SIZE
	tbit.nz p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p13) LDFD f33 = [X], OFFSET
	(p13) LDFD f37 = [BO], OFFSET
	tbit.nz p15, p0 = M, 0
	}
	;;
	{ .mmb
	(p13) LDFD f34 = [X], SIZE
	(p13) LDFD f38 = [BO], SIZE
	}
	;;
	{ .mmi
	(p13) LDFD f35 = [X], INCX3M1
	(p13) LDFD f39 = [BO], INCX3M1
	}
	;;
	{ .mmi
	(p14) LDFD f40 = [X], SIZE
	}
	;;
	(p14) LDFD f41 = [X], OFFSET
	(p13) STFD [AO1] = f32, SIZE
	tbit.nz p8, p0 = A, BASE_SHIFT
	;;
	(p14) LDFD f42 = [X], SIZE
	(p13) STFD [AO2] = f36, SIZE
	;;
	(p14) LDFD f43 = [X], OFFSET
	(p13) STFD [AO1] = f33, SIZE
	;;
	(p15) LDFD f44 = [X], SIZE
	(p13) STFD [AO2] = f37, SIZE
	;;
	(p15) LDFD f45 = [X], OFFSET
	(p13) STFD [AO1] = f34, SIZE
	(p13) STFD [AO2] = f38, SIZE
	;;
	(p13) STFD [AO1] = f35, 5 * SIZE
	(p13) STFD [AO2] = f39, 5 * SIZE
	;;
	(p14) STFD [AO1] = f40, SIZE
	;;
	(p14) STFD [AO1] = f41, SIZE
	;;
	(p14) STFD [AO1] = f42, SIZE
	;;
	(p14) STFD [AO1] = f43, SIZE
	;;
	(p15) STFD [AO1] = f44, SIZE
	;;
	(p15) STFD [AO1] = f45, SIZE
	(p8) br.cond.dpnt .L100
	;;
	.align 16
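/* Aligned path: .L11 processes eight columns of A per outer
   iteration (J = N >> 3), accumulating eight complex dot products
   against the packed x. */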
.L10:
	{ .mmi
	mov CLD1 = Y
	shladd CLD2 = INCY, 1, Y
	shr J = N, 3
	}
	;;
	{ .mmb
	mov CST1 = Y
	cmp.eq p6, p0 = r0, J
	(p6) br.cond.dpnt .L20
	}
	;;
	.align 16
.L11:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	mov BO = BUFFER
	}
	;;
	{ .mmf
	shladd AO3 = LDA, 1, A
	shladd AO4 = LDA, 1, AO2
	mov f12 = f0
	}
	{ .mmf
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	mov f14 = f0
	}
	;;
	{ .mmf
	shladd AO5 = LDA, 1, AO3
	shladd AO6 = LDA, 1, AO4
	mov f16 = f0
	}
	{ .mmf
	adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3
	adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4
	mov f18 = f0
	}
	;;
	{ .mmf
	shladd AO7 = LDA, 1, AO5
	shladd AO8 = LDA, 1, AO6
	mov f20 = f0
	}
	{ .mmf
	adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5
	adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6
	mov f22 = f0
	}
	;;
	{ .mfi
	shladd A = LDA, 3, A
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7
	adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8
	mov f11 = f0
	}
	;;
	{ .mmf
	adds WPRE = 16 * SIZE, CLD1
	adds PREB = RPREFETCH * SIZE, BO
	mov f13 = f0
	}
	{ .mmf
	adds I = -1, M
	cmp.eq p16, p0 = r0, r0
	mov f15 = f0
	}
	;;
	{ .mfi
	cmp.eq p12, p0 = r0, r0
	mov f17 = f0
	mov ar.lc = I
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	mov f19 = f0
	}
	;;
	{ .mmf
	lfetch.excl.nt1 [WPRE]
	nop __LINE__
	mov f21 = f0
	}
	{ .mmf
	mov I = 0
	nop __LINE__
	mov f23 = f0
	}
	;;
	.align 16
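/* .L16: software-pipelined inner loop over M. Each iteration loads
   one complex element from each of the eight columns (paired LDFPD)
   plus the matching packed x element from [BO], and accumulates the
   real/imaginary partial sums in f8..f23; I counts iterations modulo
   8 to rotate the prefetches across RPRE1..RPRE8 and PREB. */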
.L16:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFPD f32, f37 = [AO1], 2 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 1, I
	nop __LINE__
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p13) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) cmp.eq.unc p14, p0 = 2, I
	(p16) cmp.eq.unc p15, p0 = 3, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p16) LDFPD f42, f47 = [AO2], 2 * SIZE
	nop __LINE__
	(p20) ADD1 f12 = f116, f56, f12
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f13 = f121, f56, f13
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	nop __LINE__
	(p20) ADD1 f14 = f116, f66, f14
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f15 = f121, f66, f15
	}
	;;
	{ .mmf
	(p16) LDFPD f52, f57 = [AO3], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f8 = f121, f41, f8
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f9 = f116, f41, f9
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE3], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f10 = f121, f51, f10
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f11 = f116, f51, f11
	}
	;;
	{ .mmf
	(p16) LDFPD f62, f67 = [AO4], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f12 = f121, f61, f12
	}
	{ .mmf
	(p16) cmp.eq.unc p12, p0 = 4, I
	(p16) cmp.eq.unc p13, p0 = 5, I
	(p20) ADD4 f13 = f116, f61, f13
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE4], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f14 = f121, f71, f14
	}
	{ .mmf
	(p16) cmp.eq.unc p14, p0 = 6, I
	(p16) cmp.eq.unc p15, p0 = 7, I
	(p20) ADD4 f15 = f116, f71, f15
	}
	;;
	{ .mmf
	(p16) LDFPD f72, f77 = [AO5], 2 * SIZE
	nop __LINE__
	(p20) ADD1 f16 = f116, f76, f16
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f17 = f121, f76, f17
	}
	;;
	{ .mmf
	(p12) PREFETCH [RPRE5], 16 * SIZE
	nop __LINE__
	(p20) ADD1 f18 = f116, f86, f18
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f19 = f121, f86, f19
	}
	;;
	{ .mmf
	(p16) LDFPD f82, f87 = [AO6], 2 * SIZE
	nop __LINE__
	(p20) ADD1 f20 = f116, f96, f20
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f21 = f121, f96, f21
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE6], 16 * SIZE
	nop __LINE__
	(p20) ADD1 f22 = f116, f106, f22
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f23 = f121, f106, f23
	}
	;;
	{ .mmf
	(p16) LDFPD f92, f97 = [AO7], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f16 = f121, f81, f16
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f17 = f116, f81, f17
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE7], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f18 = f121, f91, f18
	}
	{ .mmf
	nop __LINE__
	(p16) adds I = 1, I
	(p20) ADD4 f19 = f116, f91, f19
	}
	;;
	{ .mmf
	(p16) LDFPD f102, f107 = [AO8], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f20 = f121, f101, f20
	}
	{ .mmf
	(p15) mov I = 0
	nop __LINE__
	(p20) ADD4 f21 = f116, f101, f21
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE8], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f22 = f121, f111, f22
	}
	{ .mfb
	(p16) cmp.eq.unc p12, p0 = 0, I
	(p20) ADD4 f23 = f116, f111, f23
	br.ctop.sptk.few .L16
	}
	;;
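/* .L18: fold alpha into the eight accumulated dot products and update
   y: load through CLD1/CLD2, combine with ALPHA_R/ALPHA_I via
   FMA/FNMA, store back through CST1/CST2. */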
.L18:
	LDFD f32 = [CLD1], SIZE
	LDFD f36 = [CLD2], SIZE
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	LDFD f37 = [CLD2], INCYM1
	;;
	LDFD f34 = [CLD1], SIZE
	LDFD f38 = [CLD2], SIZE
	;;
	LDFD f35 = [CLD1], INCY3M1
	LDFD f39 = [CLD2], INCY3M1
	;;
	LDFD f40 = [CLD1], SIZE
	LDFD f44 = [CLD2], SIZE
	;;
	LDFD f41 = [CLD1], INCYM1
	LDFD f45 = [CLD2], INCYM1
	;;
	LDFD f42 = [CLD1], SIZE
	LDFD f46 = [CLD2], SIZE
	;;
	LDFD f43 = [CLD1], INCY3M1
	LDFD f47 = [CLD2], INCY3M1
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f36 = ALPHA_R, f12, f36
	FMA f33 = ALPHA_I, f8, f33
	FMA f37 = ALPHA_I, f12, f37
	FMA f34 = ALPHA_R, f10, f34
	FMA f38 = ALPHA_R, f14, f38
	FMA f35 = ALPHA_I, f10, f35
	FMA f39 = ALPHA_I, f14, f39
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FNMA f36 = ALPHA_I, f13, f36
	FMA f33 = ALPHA_R, f9, f33
	FMA f37 = ALPHA_R, f13, f37
	FNMA f34 = ALPHA_I, f11, f34
	FNMA f38 = ALPHA_I, f15, f38
	FMA f35 = ALPHA_R, f11, f35
	FMA f39 = ALPHA_R, f15, f39
	;;
	FMA f40 = ALPHA_R, f16, f40
	FMA f44 = ALPHA_R, f20, f44
	FMA f41 = ALPHA_I, f16, f41
	FMA f45 = ALPHA_I, f20, f45
	FMA f42 = ALPHA_R, f18, f42
	FMA f46 = ALPHA_R, f22, f46
	FMA f43 = ALPHA_I, f18, f43
	FMA f47 = ALPHA_I, f22, f47
	;;
	{ .mmf
	STFD [CST1] = f32, SIZE
	STFD [CST2] = f36, SIZE
	FNMA f40 = ALPHA_I, f17, f40
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	FNMA f44 = ALPHA_I, f21, f44
	}
	;;
	{ .mmf
	STFD [CST1] = f33
	STFD [CST2] = f37
	FMA f41 = ALPHA_R, f17, f41
	}
	{ .mmf
	add CST1 = CST1, INCYM1
	add CST2 = CST2, INCYM1
	FMA f45 = ALPHA_R, f21, f45
	}
	;;
	{ .mmf
	STFD [CST1] = f34, SIZE
	STFD [CST2] = f38, SIZE
	FNMA f42 = ALPHA_I, f19, f42
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	FNMA f46 = ALPHA_I, f23, f46
	}
	;;
	{ .mmf
	STFD [CST1] = f35
	STFD [CST2] = f39
	FMA f43 = ALPHA_R, f19, f43
	}
	{ .mmf
	add CST1 = CST1, INCY3M1
	add CST2 = CST2, INCY3M1
	FMA f47 = ALPHA_R, f23, f47
	}
	;;
	{ .mmi
	STFD [CST1] = f40, SIZE
	STFD [CST2] = f44, SIZE
	adds J = -1, J
	}
	;;
	{ .mmi
	STFD [CST1] = f41
	STFD [CST2] = f45
	add CST1 = CST1, INCYM1
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	add CST2 = CST2, INCYM1
	}
	;;
	{ .mmi
	STFD [CST1] = f42, SIZE
	STFD [CST2] = f46, SIZE
	cmp.lt p6, p0 = 0, J
	}
	;;
	{ .mmi
	STFD [CST1] = f43
	STFD [CST2] = f47
	add CST1 = CST1, INCY3M1
	}
	{ .mmb
	add CST2 = CST2, INCY3M1
	(p6) br.cond.dptk .L11
	}
	;;
	.align 16
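/* .L20: tail for four remaining columns (bit 2 of N). */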
.L20:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	tbit.z p6, p0 = N, 2
	}
	;;
	{ .mmf
	shladd AO3 = LDA, 1, A
	shladd AO4 = LDA, 1, AO2
	mov f12 = f0
	}
	{ .mfb
	mov BO = BUFFER
	mov f14 = f0
	(p6) br.cond.dpnt .L30
	}
	;;
	{ .mfi
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	adds I = -1, M
	mov f11 = f0
	}
	;;
	{ .mmf
	adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3
	adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4
	mov f13 = f0
	}
	{ .mmf
	cmp.eq p16, p0 = r0, r0
	shladd A = LDA, 2, A
	mov f15 = f0
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	adds PREB = RPREFETCH * SIZE, BO
	mov ar.lc = I
	}
	{ .mmi
	adds WPRE = 16 * SIZE, CLD1
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L26:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFPD f32, f37 = [AO1], 2 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 2, I
	nop __LINE__
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p12) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) cmp.eq.unc p14, p0 = 4, I
	(p16) cmp.eq.unc p15, p0 = 6, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p16) LDFPD f42, f47 = [AO2], 2 * SIZE
	nop __LINE__
	(p20) ADD1 f12 = f116, f56, f12
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f13 = f121, f56, f13
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	nop __LINE__
	(p20) ADD1 f14 = f116, f66, f14
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f15 = f121, f66, f15
	}
	;;
	{ .mmf
	(p16) LDFPD f52, f57 = [AO3], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f8 = f121, f41, f8
	}
	{ .mmf
	(p16) adds I = 1, I
	nop __LINE__
	(p20) ADD4 f9 = f116, f41, f9
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE3], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f10 = f121, f51, f10
	}
	{ .mmf
	(p16) cmp.eq.unc p15, p0 = 8, I
	nop __LINE__
	(p20) ADD4 f11 = f116, f51, f11
	}
	;;
	{ .mmf
	(p16) LDFPD f62, f67 = [AO4], 2 * SIZE
	nop __LINE__
	(p20) ADD3 f12 = f121, f61, f12
	}
	{ .mmf
	(p15) mov I = 0
	nop __LINE__
	(p20) ADD4 f13 = f116, f61, f13
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE4], 16 * SIZE
	nop __LINE__
	(p20) ADD3 f14 = f121, f71, f14
	}
	{ .mfb
	(p16) cmp.eq.unc p12, p0 = 0, I
	(p20) ADD4 f15 = f116, f71, f15
	br.ctop.sptk.few .L26
	}
	;;
.L28:
	LDFD f32 = [CLD1], SIZE
	LDFD f36 = [CLD2], SIZE
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	LDFD f37 = [CLD2], INCYM1
	;;
	LDFD f34 = [CLD1], SIZE
	LDFD f38 = [CLD2], SIZE
	;;
	LDFD f35 = [CLD1], INCY3M1
	LDFD f39 = [CLD2], INCY3M1
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f36 = ALPHA_R, f12, f36
	FMA f33 = ALPHA_I, f8, f33
	FMA f37 = ALPHA_I, f12, f37
	FMA f34 = ALPHA_R, f10, f34
	FMA f38 = ALPHA_R, f14, f38
	FMA f35 = ALPHA_I, f10, f35
	FMA f39 = ALPHA_I, f14, f39
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FNMA f36 = ALPHA_I, f13, f36
	FMA f33 = ALPHA_R, f9, f33
	FMA f37 = ALPHA_R, f13, f37
	FNMA f34 = ALPHA_I, f11, f34
	FNMA f38 = ALPHA_I, f15, f38
	FMA f35 = ALPHA_R, f11, f35
	FMA f39 = ALPHA_R, f15, f39
	;;
	STFD [CST1] = f32, SIZE
	STFD [CST2] = f36, SIZE
	;;
	STFD [CST1] = f33
	STFD [CST2] = f37
	add CST1 = CST1, INCYM1
	add CST2 = CST2, INCYM1
	;;
	STFD [CST1] = f34, SIZE
	STFD [CST2] = f38, SIZE
	;;
	STFD [CST1] = f35
	STFD [CST2] = f39
	add CST1 = CST1, INCY3M1
	add CST2 = CST2, INCY3M1
	;;
	.align 16
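/* .L30: tail for two remaining columns (bit 1 of N); the paired
   accumulators are folded together with FADD in .L38 before the
   update of y. */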
.L30:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	tbit.z p6, p0 = N, 1
	}
	;;
	{ .mmf
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	mov f12 = f0
	}
	{ .mfb
	adds I = -1, M
	mov f14 = f0
	(p6) br.cond.dpnt .L40
	}
	;;
	{ .mfi
	mov BO = BUFFER
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	cmp.eq p16, p0 = r0, r0
	shladd A = LDA, 1, A
	mov f11 = f0
	}
	;;
	{ .mfi
	adds WPRE = 16 * SIZE, CLD1
	mov f13 = f0
	mov ar.lc = I
	}
	{ .mmf
	adds PREB = RPREFETCH * SIZE, BO
	nop __LINE__
	mov f15 = f0
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L36:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFPD f32, f37 = [AO1], 2 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 4, I
	(p16) adds I = 1, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p12) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) cmp.eq.unc p12, p0 = 8, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	(p16) LDFPD f42, f47 = [AO2], 2 * SIZE
	(p20) ADD3 f12 = f121, f41, f12
	}
	{ .mmf
	(p12) mov I = 0
	(p20) ADD4 f13 = f116, f41, f13
	}
	;;
	{ .mmf
	(p20) ADD3 f14 = f121, f51, f14
	}
	{ .mfb
	nop __LINE__
	(p20) ADD4 f15 = f116, f51, f15
	br.ctop.sptk.few .L36
	}
	;;
.L38:
	LDFD f32 = [CLD1], SIZE
	FADD f8 = f8, f12
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	FADD f10 = f10, f14
	;;
	LDFD f34 = [CLD1], SIZE
	FADD f9 = f9, f13
	;;
	LDFD f35 = [CLD1], INCYM1
	FADD f11 = f11, f15
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f33 = ALPHA_I, f8, f33
	FMA f34 = ALPHA_R, f10, f34
	FMA f35 = ALPHA_I, f10, f35
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FMA f33 = ALPHA_R, f9, f33
	FNMA f34 = ALPHA_I, f11, f34
	FMA f35 = ALPHA_R, f11, f35
	;;
	STFD [CST1] = f32, SIZE
	;;
	STFD [CST1] = f33
	add CST1 = CST1, INCYM1
	;;
	STFD [CST1] = f34, SIZE
	;;
	STFD [CST1] = f35
	add CST1 = CST1, INCYM1
	;;
	.align 16
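/* .L40: tail for the last column (bit 0 of N). */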
.L40:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	mov f9 = f0
	tbit.z p6, p0 = N, 0
	}
	;;
	{ .mfi
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	mov f10 = f0
	mov ar.ec = 5
	}
	{ .mfb
	adds I = -1, M
	mov f11 = f0
	(p6) br.cond.dpnt .L999
	}
	;;
	{ .mmi
	cmp.eq p16, p0 = r0, r0
	add A = LDA, A
	mov ar.lc = I
	}
	{ .mmi
	adds WPRE = 16 * SIZE, CLD1
	adds PREB = RPREFETCH * SIZE, BO
	mov BO = BUFFER
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L46:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFPD f32, f37 = [AO1], 2 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p12, p0 = 7, I
	(p16) adds I = 1, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD3 f10 = f121, f41, f10
	}
	{ .mfb
	(p12) mov I = 0
	(p20) ADD4 f11 = f116, f41, f11
	br.ctop.sptk.few .L46
	}
	;;
.L48:
	LDFD f32 = [CLD1], SIZE
	FADD f8 = f8, f10
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	FADD f9 = f9, f11
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f33 = ALPHA_I, f8, f33
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FMA f33 = ALPHA_R, f9, f33
	;;
	STFD [CST1] = f32, SIZE
	;;
	STFD [CST1] = f33
	add CST1 = CST1, INCYM1
	br .L999
	.align 16
	;;
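/* .L100: path for A not aligned on a full complex element. It mirrors
   .L10 onward, but reads column data with single LDFD loads instead
   of paired LDFPD. */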
.L100:
	{ .mmi
	mov CLD1 = Y
	shladd CLD2 = INCY, 1, Y
	shr J = N, 3
	}
	;;
	{ .mmb
	mov CST1 = Y
	cmp.eq p6, p0 = r0, J
	(p6) br.cond.dpnt .L120
	}
	;;
	.align 16
.L111:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	mov BO = BUFFER
	}
	;;
	{ .mmf
	shladd AO3 = LDA, 1, A
	shladd AO4 = LDA, 1, AO2
	mov f12 = f0
	}
	{ .mmf
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	mov f14 = f0
	}
	;;
	{ .mmf
	shladd AO5 = LDA, 1, AO3
	shladd AO6 = LDA, 1, AO4
	mov f16 = f0
	}
	{ .mmf
	adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3
	adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4
	mov f18 = f0
	}
	;;
	{ .mmf
	shladd AO7 = LDA, 1, AO5
	shladd AO8 = LDA, 1, AO6
	mov f20 = f0
	}
	{ .mmf
	adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5
	adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6
	mov f22 = f0
	}
	;;
	{ .mfi
	shladd A = LDA, 3, A
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7
	adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8
	mov f11 = f0
	}
	;;
	{ .mmf
	adds WPRE = 16 * SIZE, CLD1
	adds PREB = RPREFETCH * SIZE, BO
	mov f13 = f0
	}
	{ .mmf
	adds I = -1, M
	cmp.eq p16, p0 = r0, r0
	mov f15 = f0
	}
	;;
	{ .mfi
	cmp.eq p12, p0 = r0, r0
	mov f17 = f0
	mov ar.lc = I
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	mov f19 = f0
	}
	;;
	{ .mmf
	lfetch.excl.nt1 [WPRE]
	nop __LINE__
	mov f21 = f0
	}
	{ .mmf
	mov I = 0
	nop __LINE__
	mov f23 = f0
	}
	;;
	.align 16
.L116:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFD f32 = [AO1], 1 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 1, I
	(p16) cmp.eq.unc p14, p0 = 2, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p13) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) LDFD f37 = [AO1], 1 * SIZE
	(p16) cmp.eq.unc p15, p0 = 3, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	(p16) LDFD f42 = [AO2], 1 * SIZE
	(p20) ADD1 f12 = f116, f56, f12
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f13 = f121, f56, f13
	}
	;;
	{ .mmf
	(p16) LDFD f47 = [AO2], 1 * SIZE
	nop __LINE__
	(p20) ADD1 f14 = f116, f66, f14
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f15 = f121, f66, f15
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE3], 16 * SIZE
	(p16) LDFD f52 = [AO3], 1 * SIZE
	(p20) ADD3 f8 = f121, f41, f8
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f9 = f116, f41, f9
	}
	;;
	{ .mmf
	(p16) LDFD f57 = [AO3], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f10 = f121, f51, f10
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f11 = f116, f51, f11
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE4], 16 * SIZE
	(p16) LDFD f62 = [AO4], 1 * SIZE
	(p20) ADD3 f12 = f121, f61, f12
	}
	{ .mmf
	(p16) cmp.eq.unc p12, p0 = 4, I
	(p16) cmp.eq.unc p13, p0 = 5, I
	(p20) ADD4 f13 = f116, f61, f13
	}
	;;
	{ .mmf
	(p16) LDFD f67 = [AO4], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f14 = f121, f71, f14
	}
	{ .mmf
	(p16) cmp.eq.unc p14, p0 = 6, I
	(p16) cmp.eq.unc p15, p0 = 7, I
	(p20) ADD4 f15 = f116, f71, f15
	}
	;;
	{ .mmf
	(p12) PREFETCH [RPRE5], 16 * SIZE
	(p16) LDFD f72 = [AO5], 1 * SIZE
	(p20) ADD1 f16 = f116, f76, f16
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f17 = f121, f76, f17
	}
	;;
	{ .mmf
	(p16) LDFD f77 = [AO5], 1 * SIZE
	nop __LINE__
	(p20) ADD1 f18 = f116, f86, f18
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f19 = f121, f86, f19
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE6], 16 * SIZE
	(p16) LDFD f82 = [AO6], 1 * SIZE
	(p20) ADD1 f20 = f116, f96, f20
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f21 = f121, f96, f21
	}
	;;
	{ .mmf
	(p16) LDFD f87 = [AO6], 1 * SIZE
	nop __LINE__
	(p20) ADD1 f22 = f116, f106, f22
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f23 = f121, f106, f23
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE7], 16 * SIZE
	(p16) LDFD f92 = [AO7], 1 * SIZE
	(p20) ADD3 f16 = f121, f81, f16
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD4 f17 = f116, f81, f17
	}
	;;
	{ .mmf
	(p16) LDFD f97 = [AO7], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f18 = f121, f91, f18
	}
	{ .mmf
	nop __LINE__
	(p16) adds I = 1, I
	(p20) ADD4 f19 = f116, f91, f19
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE8], 16 * SIZE
	(p16) LDFD f102 = [AO8], 1 * SIZE
	(p20) ADD3 f20 = f121, f101, f20
	}
	{ .mmf
	(p15) mov I = 0
	nop __LINE__
	(p20) ADD4 f21 = f116, f101, f21
	}
	;;
	{ .mmf
	(p16) LDFD f107 = [AO8], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f22 = f121, f111, f22
	}
	{ .mfb
	(p16) cmp.eq.unc p12, p0 = 0, I
	(p20) ADD4 f23 = f116, f111, f23
	br.ctop.sptk.few .L116
	}
	;;
.L118:
	LDFD f32 = [CLD1], SIZE
	LDFD f36 = [CLD2], SIZE
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	LDFD f37 = [CLD2], INCYM1
	;;
	LDFD f34 = [CLD1], SIZE
	LDFD f38 = [CLD2], SIZE
	;;
	LDFD f35 = [CLD1], INCY3M1
	LDFD f39 = [CLD2], INCY3M1
	;;
	LDFD f40 = [CLD1], SIZE
	LDFD f44 = [CLD2], SIZE
	;;
	LDFD f41 = [CLD1], INCYM1
	LDFD f45 = [CLD2], INCYM1
	;;
	LDFD f42 = [CLD1], SIZE
	LDFD f46 = [CLD2], SIZE
	;;
	LDFD f43 = [CLD1], INCY3M1
	LDFD f47 = [CLD2], INCY3M1
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f36 = ALPHA_R, f12, f36
	FMA f33 = ALPHA_I, f8, f33
	FMA f37 = ALPHA_I, f12, f37
	FMA f34 = ALPHA_R, f10, f34
	FMA f38 = ALPHA_R, f14, f38
	FMA f35 = ALPHA_I, f10, f35
	FMA f39 = ALPHA_I, f14, f39
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FNMA f36 = ALPHA_I, f13, f36
	FMA f33 = ALPHA_R, f9, f33
	FMA f37 = ALPHA_R, f13, f37
	FNMA f34 = ALPHA_I, f11, f34
	FNMA f38 = ALPHA_I, f15, f38
	FMA f35 = ALPHA_R, f11, f35
	FMA f39 = ALPHA_R, f15, f39
	;;
	FMA f40 = ALPHA_R, f16, f40
	FMA f44 = ALPHA_R, f20, f44
	FMA f41 = ALPHA_I, f16, f41
	FMA f45 = ALPHA_I, f20, f45
	FMA f42 = ALPHA_R, f18, f42
	FMA f46 = ALPHA_R, f22, f46
	FMA f43 = ALPHA_I, f18, f43
	FMA f47 = ALPHA_I, f22, f47
	;;
	{ .mmf
	STFD [CST1] = f32, SIZE
	STFD [CST2] = f36, SIZE
	FNMA f40 = ALPHA_I, f17, f40
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	FNMA f44 = ALPHA_I, f21, f44
	}
	;;
	{ .mmf
	STFD [CST1] = f33
	STFD [CST2] = f37
	FMA f41 = ALPHA_R, f17, f41
	}
	{ .mmf
	add CST1 = CST1, INCYM1
	add CST2 = CST2, INCYM1
	FMA f45 = ALPHA_R, f21, f45
	}
	;;
	{ .mmf
	STFD [CST1] = f34, SIZE
	STFD [CST2] = f38, SIZE
	FNMA f42 = ALPHA_I, f19, f42
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	FNMA f46 = ALPHA_I, f23, f46
	}
	;;
	{ .mmf
	STFD [CST1] = f35
	STFD [CST2] = f39
	FMA f43 = ALPHA_R, f19, f43
	}
	{ .mmf
	add CST1 = CST1, INCY3M1
	add CST2 = CST2, INCY3M1
	FMA f47 = ALPHA_R, f23, f47
	}
	;;
	{ .mmi
	STFD [CST1] = f40, SIZE
	STFD [CST2] = f44, SIZE
	adds J = -1, J
	}
	;;
	{ .mmi
	STFD [CST1] = f41
	STFD [CST2] = f45
	add CST1 = CST1, INCYM1
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	add CST2 = CST2, INCYM1
	}
	;;
	{ .mmi
	STFD [CST1] = f42, SIZE
	STFD [CST2] = f46, SIZE
	cmp.lt p6, p0 = 0, J
	}
	;;
	{ .mmi
	STFD [CST1] = f43
	STFD [CST2] = f47
	add CST1 = CST1, INCY3M1
	}
	{ .mmb
	add CST2 = CST2, INCY3M1
	(p6) br.cond.dptk .L111
	}
	;;
	.align 16
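/* .L120: unaligned tail for four columns (bit 2 of N). */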
.L120:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	tbit.z p6, p0 = N, 2
	}
	;;
	{ .mmf
	shladd AO3 = LDA, 1, A
	shladd AO4 = LDA, 1, AO2
	mov f12 = f0
	}
	{ .mfb
	mov BO = BUFFER
	mov f14 = f0
	(p6) br.cond.dpnt .L130
	}
	;;
	{ .mfi
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	adds I = -1, M
	mov f11 = f0
	}
	;;
	{ .mmf
	adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3
	adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4
	mov f13 = f0
	}
	{ .mmf
	cmp.eq p16, p0 = r0, r0
	shladd A = LDA, 2, A
	mov f15 = f0
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	adds PREB = RPREFETCH * SIZE, BO
	mov ar.lc = I
	}
	{ .mmi
	adds WPRE = 16 * SIZE, CLD1
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L126:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFD f32 = [AO1], 1 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 2, I
	(p16) cmp.eq.unc p14, p0 = 4, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p12) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) LDFD f37 = [AO1], 1 * SIZE
	(p16) cmp.eq.unc p15, p0 = 6, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p16) LDFD f42 = [AO2], 1 * SIZE
	nop __LINE__
	(p20) ADD1 f12 = f116, f56, f12
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f13 = f121, f56, f13
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	(p16) LDFD f47 = [AO2], 1 * SIZE
	(p20) ADD1 f14 = f116, f66, f14
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p20) ADD2 f15 = f121, f66, f15
	}
	;;
	{ .mmf
	(p16) LDFD f52 = [AO3], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f8 = f121, f41, f8
	}
	{ .mmf
	nop __LINE__
	(p16) adds I = 1, I
	(p20) ADD4 f9 = f116, f41, f9
	}
	;;
	{ .mmf
	(p14) PREFETCH [RPRE3], 16 * SIZE
	(p16) LDFD f57 = [AO3], 1 * SIZE
	(p20) ADD3 f10 = f121, f51, f10
	}
	{ .mmf
	nop __LINE__
	(p16) cmp.eq.unc p15, p0 = 8, I
	(p20) ADD4 f11 = f116, f51, f11
	}
	;;
	{ .mmf
	(p16) LDFD f62 = [AO4], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f12 = f121, f61, f12
	}
	{ .mmf
	(p15) mov I = 0
	nop __LINE__
	(p20) ADD4 f13 = f116, f61, f13
	}
	;;
	{ .mmf
	(p15) PREFETCH [RPRE4], 16 * SIZE
	(p16) LDFD f67 = [AO4], 1 * SIZE
	(p20) ADD3 f14 = f121, f71, f14
	}
	{ .mfb
	(p16) cmp.eq.unc p12, p0 = 0, I
	(p20) ADD4 f15 = f116, f71, f15
	br.ctop.sptk.few .L126
	}
	;;
.L128:
	LDFD f32 = [CLD1], SIZE
	LDFD f36 = [CLD2], SIZE
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	LDFD f37 = [CLD2], INCYM1
	;;
	LDFD f34 = [CLD1], SIZE
	LDFD f38 = [CLD2], SIZE
	;;
	LDFD f35 = [CLD1], INCY3M1
	LDFD f39 = [CLD2], INCY3M1
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f36 = ALPHA_R, f12, f36
	FMA f33 = ALPHA_I, f8, f33
	FMA f37 = ALPHA_I, f12, f37
	FMA f34 = ALPHA_R, f10, f34
	FMA f38 = ALPHA_R, f14, f38
	FMA f35 = ALPHA_I, f10, f35
	FMA f39 = ALPHA_I, f14, f39
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FNMA f36 = ALPHA_I, f13, f36
	FMA f33 = ALPHA_R, f9, f33
	FMA f37 = ALPHA_R, f13, f37
	FNMA f34 = ALPHA_I, f11, f34
	FNMA f38 = ALPHA_I, f15, f38
	FMA f35 = ALPHA_R, f11, f35
	FMA f39 = ALPHA_R, f15, f39
	;;
	STFD [CST1] = f32, SIZE
	STFD [CST2] = f36, SIZE
	;;
	STFD [CST1] = f33
	STFD [CST2] = f37
	add CST1 = CST1, INCYM1
	add CST2 = CST2, INCYM1
	;;
	STFD [CST1] = f34, SIZE
	STFD [CST2] = f38, SIZE
	;;
	STFD [CST1] = f35
	STFD [CST2] = f39
	add CST1 = CST1, INCY3M1
	add CST2 = CST2, INCY3M1
	;;
	.align 16
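/* .L130: unaligned tail for two columns (bit 1 of N). */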
.L130:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	add AO2 = LDA, A
	mov f10 = f0
	tbit.z p6, p0 = N, 1
	}
	;;
	{ .mmf
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2
	mov f12 = f0
	}
	{ .mfb
	adds I = -1, M
	mov f14 = f0
	(p6) br.cond.dpnt .L140
	}
	;;
	{ .mfi
	mov BO = BUFFER
	mov f9 = f0
	mov ar.ec = 5
	}
	{ .mmf
	cmp.eq p16, p0 = r0, r0
	shladd A = LDA, 1, A
	mov f11 = f0
	}
	;;
	{ .mfi
	adds WPRE = 16 * SIZE, CLD1
	mov f13 = f0
	mov ar.lc = I
	}
	{ .mmf
	adds PREB = RPREFETCH * SIZE, BO
	nop __LINE__
	mov f15 = f0
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L136:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFD f32 = [AO1], 1 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p13, p0 = 4, I
	(p16) adds I = 1, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p12) PREFETCH [PREB], 16 * SIZE
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p20) ADD1 f10 = f116, f46, f10
	}
	{ .mmf
	(p16) LDFD f37 = [AO1], 1 * SIZE
	(p16) cmp.eq.unc p12, p0 = 8, I
	(p20) ADD2 f11 = f121, f46, f11
	}
	;;
	{ .mmf
	(p13) PREFETCH [RPRE2], 16 * SIZE
	(p16) LDFD f42 = [AO2], 1 * SIZE
	(p20) ADD3 f12 = f121, f41, f12
	}
	{ .mmf
	(p12) mov I = 0
	nop __LINE__
	(p20) ADD4 f13 = f116, f41, f13
	}
	;;
	{ .mmf
	(p16) LDFD f47 = [AO2], 1 * SIZE
	nop __LINE__
	(p20) ADD3 f14 = f121, f51, f14
	}
	{ .mfb
	nop __LINE__
	(p20) ADD4 f15 = f116, f51, f15
	br.ctop.sptk.few .L136
	}
	;;
.L138:
	LDFD f32 = [CLD1], SIZE
	FADD f8 = f8, f12
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	FADD f10 = f10, f14
	;;
	LDFD f34 = [CLD1], SIZE
	FADD f9 = f9, f13
	;;
	LDFD f35 = [CLD1], INCYM1
	FADD f11 = f11, f15
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f33 = ALPHA_I, f8, f33
	FMA f34 = ALPHA_R, f10, f34
	FMA f35 = ALPHA_I, f10, f35
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FMA f33 = ALPHA_R, f9, f33
	FNMA f34 = ALPHA_I, f11, f34
	FMA f35 = ALPHA_R, f11, f35
	;;
	STFD [CST1] = f32, SIZE
	;;
	STFD [CST1] = f33
	add CST1 = CST1, INCYM1
	;;
	STFD [CST1] = f34, SIZE
	;;
	STFD [CST1] = f35
	add CST1 = CST1, INCYM1
	;;
	.align 16
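/* .L140: unaligned tail for the last column (bit 0 of N). */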
.L140:
	{ .mfi
	mov AO1 = A
	mov f8 = f0
	mov pr.rot = 0
	}
	{ .mfi
	mov f9 = f0
	tbit.z p6, p0 = N, 0
	}
	;;
	{ .mfi
	adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1
	mov f10 = f0
	mov ar.ec = 5
	}
	{ .mfb
	adds I = -1, M
	mov f11 = f0
	(p6) br.cond.dpnt .L999
	}
	;;
	{ .mmi
	cmp.eq p16, p0 = r0, r0
	shladd A = LDA, 1, A
	mov ar.lc = I
	}
	{ .mmi
	adds WPRE = 16 * SIZE, CLD1
	adds PREB = RPREFETCH * SIZE, BO
	mov BO = BUFFER
	}
	;;
	{ .mmi
	lfetch.excl.nt1 [WPRE]
	cmp.eq p12, p0 = r0, r0
	mov I = 0
	}
	;;
	.align 16
.L146:
	{ .mmf
	(p12) PREFETCH [RPRE1], 16 * SIZE
	(p16) LDFD f32 = [AO1], 1 * SIZE
	(p20) ADD1 f8 = f116, f36, f8
	}
	{ .mmf
	(p16) cmp.eq.unc p12, p0 = 7, I
	(p16) adds I = 1, I
	(p20) ADD2 f9 = f121, f36, f9
	}
	;;
	{ .mmf
	(p16) LDFPD f112, f117 = [BO], 2 * SIZE
	(p16) LDFD f37 = [AO1], 1 * SIZE
	(p20) ADD3 f10 = f121, f41, f10
	}
	{ .mfb
	(p12) mov I = 0
	(p20) ADD4 f11 = f116, f41, f11
	br.ctop.sptk.few .L146
	}
	;;
.L148:
	LDFD f32 = [CLD1], SIZE
	FADD f8 = f8, f10
	shladd CST2 = INCY, 1, CST1
	;;
	LDFD f33 = [CLD1], INCYM1
	FADD f9 = f9, f11
	;;
	FMA f32 = ALPHA_R, f8, f32
	FMA f33 = ALPHA_I, f8, f33
	;;
	FNMA f32 = ALPHA_I, f9, f32
	FMA f33 = ALPHA_R, f9, f33
	;;
	STFD [CST1] = f32, SIZE
	;;
	STFD [CST1] = f33
	add CST1 = CST1, INCYM1
	;;
	.align 16
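/* .L999: restore the spilled f16-f23 and the saved ar.lc, pr and
   ar.pfs, then return. */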
.L999:
	mov r8 = r0
	adds r9 = 1 * 16, SP
	;;
	ldf.fill f16 = [SP], 32
	ldf.fill f17 = [r9], 32
	mov ar.lc = ARLC
	;;
	ldf.fill f18 = [SP], 32
	ldf.fill f19 = [r9], 32
	mov pr = PR, -1
	;;
	ldf.fill f20 = [SP], 32
	ldf.fill f21 = [r9], 32
	mov ar.pfs = ARPFS
	;;
	ldf.fill f22 = [SP], 32
	ldf.fill f23 = [r9]
	br.ret.sptk.many b0
	;;
	EPILOGUE