You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects its assembler-side definitions -- confirm in common.h */
  39. #include "common.h" /* presumably the source of PROLOGUE, PROFCODE, SIZE, BASE_SHIFT, LDFD/STFD/LDFPD, FMA/FMPY/FADD used below -- none are defined in this file */
  40. #define P 4096 /* cap applied to MIN_M on each .LIs_loop pass (loaded into r2) */
  41. #define SP r12 /* stack pointer: base for the stack-argument loads and the f16-f23 spill area */
  42. #define M r32 /* dimension M; routine branches to .L999 when M <= 0 */
  43. #define N r33 /* dimension N; routine branches to .L999 when N <= 0; J = N >> 3 */
  44. #define A r36 /* matrix pointer; seeds AO1..AO8 and is advanced by 8 * LDA in .L11 */
  45. #define LDA r37 /* leading dimension; shifted left by BASE_SHIFT after entry */
  46. #define X r38 /* source vector, read with post-increment INCX and copied into BUFFER */
  47. #define INCX r39 /* X stride; shifted left by BASE_SHIFT after entry */
  48. #define Y r34 /* loaded from [SP + 16] in the prologue; copied into CO */
  49. #define INCY r35 /* loaded from [SP + 24] in the prologue; shifted left by BASE_SHIFT */
  50. #define BUFFER r11 /* loaded from [SP + 32]; destination of the copied X values */
  51. #define MIN_M r14 /* M - IS, capped at P; reduced by one when p8 is set */
  52. #define I r15 /* scratch trip counter used to set ar.lc for the inner loops */
  53. #define J r16 /* N >> 3, computed at .L10 */
  54. #define IS r17 /* offset subtracted from M at .LIs_loop; initialised to 0 */
  55. #define AO1 r18 /* AO1..AO8: eight read pointers into A spaced LDA apart; AO1/AO2 also serve as BUFFER store pointers in .L01/.L05 */
  56. #define AO2 r19
  57. #define AO3 r20
  58. #define AO4 r21
  59. #define AO5 r22
  60. #define AO6 r23
  61. #define AO7 r24
  62. #define AO8 r25
  63. #define BO r26 /* second X read pointer (X + 4 * INCX) in .L01; BUFFER read pointer in .L12/.L16 */
  64. #define LDAP r27 /* P - N * LDA, using LDA before the BASE_SHIFT scaling; its use is not visible in this excerpt */
  65. #define RPRE1 loc0 /* RPRE1..RPRE8: prefetch addresses running ahead of AO1..AO8 */
  66. #define RPRE2 loc1
  67. #define RPRE3 loc2
  68. #define RPRE4 loc3
  69. #define RPRE5 loc4
  70. #define RPRE6 loc5
  71. #define RPRE7 loc6
  72. #define RPRE8 loc7
  73. #define AO21 loc8 /* AO21/AO41/AO61/AO81 are not referenced in the visible portion of the routine */
  74. #define AO41 loc9
  75. #define AO61 loc10
  76. #define AO81 loc11
  77. #define PREB r8 /* prefetch address running ahead of BO; r8 is also a spill pointer in the prologue */
  78. #define WPRE r9 /* address for the exclusive prefetch at CO + 8 * SIZE; r9 is also a spill pointer in the prologue */
  79. #define OFFSET PREB /* shares r8 with PREB; holds 5 * INCX as the final post-increment in .L01 */
  80. #define CO r10 /* working copy of Y, stepped by INCY */
  81. #define ARLC r29 /* saved ar.lc */
  82. #define PR r30 /* saved predicate registers */
  83. #define ARPFS r31 /* saved ar.pfs from alloc */
  84. #ifdef DOUBLE /* RPREFETCH is a prefetch distance in elements; it is multiplied by SIZE where used */
  85. #define RPREFETCH (16 * 3 + 8)
  86. #else
  87. #define RPREFETCH (16 * 3 + 16)
  88. #endif
  89. #define PREFETCH lfetch.nt1 /* prefetch instruction issued on RPRE1..RPRE8 and PREB in .L12 */
  90. #define ALPHA f6 /* copy of f8 taken in the prologue, before f8 is reused as an accumulator */
  91. PROLOGUE
  92. .prologue
  93. PROFCODE
  94. { .mmi
  95. .save ar.pfs, ARPFS
  96. alloc ARPFS = ar.pfs, 8, 16, 8, 0
  97. setf.sig f11 = LDA
  98. mov ARLC = ar.lc
  99. }
  100. { .mmi
  101. adds r15 = 24, SP
  102. adds r16 = 32, SP
  103. adds r14 = 16, SP
  104. }
  105. ;;
  106. { .mmi
  107. setf.sig f10 = N
  108. ld8 Y = [r14]
  109. mov PR = pr
  110. }
  111. { .mmi
  112. ld8 INCY = [r15]
  113. adds r8 = -8 * 16, SP
  114. adds r9 = -7 * 16, SP
  115. }
  116. ;;
  117. { .mmi
  118. stf.spill [r8] = f16, 32
  119. stf.spill [r9] = f17, 32
  120. adds SP = -8 * 16, SP
  121. }
  122. ;;
  123. { .mmf
  124. stf.spill [r8] = f18, 32
  125. stf.spill [r9] = f19, 32
  126. mov ALPHA = f8
  127. }
  128. ;;
  129. { .mmi
  130. stf.spill [r8] = f20, 32
  131. stf.spill [r9] = f21, 32
  132. mov IS = 0
  133. }
  134. ;;
  135. { .mmf
  136. stf.spill [r8] = f22
  137. stf.spill [r9] = f23
  138. xmpy.l f10 = f10, f11
  139. }
  140. .body
  141. ;;
  142. ;;
  143. { .mmi
  144. ld8 BUFFER = [r16]
  145. cmp.ge p7, p0 = r0, M
  146. cmp.ge p6, p0 = r0, N
  147. }
  148. ;;
  149. { .mmi
  150. shladd INCX = INCX, BASE_SHIFT, r0
  151. shladd LDA = LDA, BASE_SHIFT, r0
  152. shladd INCY = INCY, BASE_SHIFT, r0
  153. }
  154. ;;
  155. { .mmi
  156. getf.sig LDAP = f10
  157. mov r2 = P
  158. tbit.nz p8, p0 = A, BASE_SHIFT
  159. }
  160. { .mmi
  161. nop __LINE__
  162. nop __LINE__
  163. tbit.nz p9, p0 = LDA, BASE_SHIFT
  164. }
  165. ;;
  166. { .mbb
  167. sub LDAP = r2, LDAP
  168. (p7) br.cond.dpnt .L999
  169. (p6) br.cond.dpnt .L999
  170. }
  171. .align 16
  172. ;;
  173. .LIs_loop:
  174. { .mmi
  175. sub MIN_M = M, IS
  176. (p8) LDFD f32 = [X], INCX
  177. mov pr.rot= 0
  178. }
  179. { .mmi
  180. mov AO1 = BUFFER
  181. adds AO2 = 4 * SIZE, BUFFER
  182. }
  183. ;;
  184. cmp.le p6, p0 = r2, MIN_M
  185. ;;
  186. (p6) mov MIN_M = P
  187. ;;
  188. (p8) adds MIN_M = -1, MIN_M
  189. ;;
  190. { .mmi
  191. shladd OFFSET = INCX, 2, INCX
  192. shladd BO = INCX, 2, X
  193. shr I = MIN_M, 3
  194. }
  195. ;;
  196. { .mmi
  197. adds I = -1, I
  198. cmp.eq p16, p0 = r0, r0
  199. mov ar.ec= 5
  200. }
  201. ;;
  202. { .mmi
  203. (p8) STFD [AO1] = f32, 2 * SIZE
  204. (p8) adds AO2 = 6 * SIZE, BUFFER
  205. mov ar.lc = I
  206. }
  207. { .mib
  208. cmp.gt p6, p0 = 0, I
  209. tbit.nz p13, p0 = MIN_M, 2
  210. (p6) br.cond.dpnt .L05
  211. }
  212. ;;
  213. .align 16
  214. .L01:
  215. (p20) STFD [AO1] = f36, SIZE
  216. (p20) STFD [AO2] = f56, SIZE
  217. (p16) LDFD f32 = [X], INCX
  218. (p16) LDFD f52 = [BO], INCX
  219. ;;
  220. (p20) STFD [AO1] = f41, SIZE
  221. (p20) STFD [AO2] = f61, SIZE
  222. (p16) LDFD f37 = [X], INCX
  223. (p16) LDFD f57 = [BO], INCX
  224. ;;
  225. (p20) STFD [AO1] = f46, SIZE
  226. (p20) STFD [AO2] = f66, SIZE
  227. (p16) LDFD f42 = [X], INCX
  228. (p16) LDFD f62 = [BO], INCX
  229. ;;
  230. (p20) STFD [AO1] = f51, 5 * SIZE
  231. (p20) STFD [AO2] = f71, 5 * SIZE
  232. (p16) LDFD f47 = [X], OFFSET
  233. (p16) LDFD f67 = [BO], OFFSET
  234. br.ctop.sptk.few .L01
  235. ;;
  236. .align 16
  237. .L05:
  238. (p13) LDFD f32 = [X], INCX
  239. tbit.nz p14, p0 = MIN_M, 1
  240. ;;
  241. (p13) LDFD f33 = [X], INCX
  242. tbit.nz p15, p0 = MIN_M, 0
  243. ;;
  244. (p13) LDFD f34 = [X], INCX
  245. ;;
  246. (p13) LDFD f35 = [X], INCX
  247. ;;
  248. (p14) LDFD f36 = [X], INCX
  249. ;;
  250. (p13) STFD [AO1] = f32, SIZE
  251. (p14) LDFD f37 = [X], INCX
  252. ;;
  253. (p13) STFD [AO1] = f33, SIZE
  254. (p15) LDFD f38 = [X], INCX
  255. ;;
  256. (p13) STFD [AO1] = f34, SIZE
  257. ;;
  258. (p13) STFD [AO1] = f35, SIZE
  259. ;;
  260. (p14) STFD [AO1] = f36, SIZE
  261. ;;
  262. (p14) STFD [AO1] = f37, SIZE
  263. ;;
  264. (p15) STFD [AO1] = f38, SIZE
  265. (p9) br.cond.dpnt .L100
  266. ;;
  267. .align 16
  268. .L10:
  269. { .mmi
  270. mov CO = Y
  271. nop __LINE__
  272. shr J = N, 3
  273. }
  274. ;;
  275. { .mib
  276. nop __LINE__
  277. cmp.eq p6, p0 = r0, J
  278. (p6) br.cond.dpnt .L20
  279. }
  280. ;;
  281. .align 16
  282. .L11:
  283. { .mfi
  284. mov AO1 = A
  285. mov f8 = f0
  286. mov pr.rot= 0
  287. }
  288. { .mfi
  289. add AO2 = LDA, A
  290. mov f10 = f0
  291. shr I = MIN_M, 4
  292. }
  293. ;;
  294. { .mmf
  295. shladd AO3 = LDA, 1, A
  296. shladd AO4 = LDA, 1, AO2
  297. mov f12 = f0
  298. }
  299. { .mmf
  300. (p8) LDFD f32 = [AO1], SIZE
  301. (p8) LDFD f33 = [AO2], SIZE
  302. mov f14 = f0
  303. }
  304. ;;
  305. { .mmf
  306. shladd AO5 = LDA, 1, AO3
  307. shladd AO6 = LDA, 1, AO4
  308. mov f16 = f0
  309. }
  310. { .mmf
  311. (p8) LDFD f34 = [AO3], SIZE
  312. (p8) LDFD f35 = [AO4], SIZE
  313. mov f18 = f0
  314. }
  315. ;;
  316. { .mmf
  317. shladd AO7 = LDA, 1, AO5
  318. shladd AO8 = LDA, 1, AO6
  319. mov f20 = f0
  320. }
  321. { .mmf
  322. (p8) LDFD f36 = [AO5], SIZE
  323. (p8) LDFD f37 = [AO6], SIZE
  324. mov f22 = f0
  325. }
  326. ;;
  327. { .mfi
  328. (p8) LDFD f38 = [AO7], SIZE
  329. mov f9 = f0
  330. mov ar.ec= 2
  331. }
  332. { .mmf
  333. (p8) LDFD f39 = [AO8], SIZE
  334. mov BO = BUFFER
  335. mov f11 = f0
  336. }
  337. ;;
  338. { .mmf
  339. (p8) LDFD f40 = [BO], 2 * SIZE
  340. cmp.eq p6, p0 = 0, I
  341. mov f13 = f0
  342. }
  343. { .mmf
  344. shladd A = LDA, 3, A
  345. cmp.eq p16, p0 = r0, r0
  346. mov f15 = f0
  347. }
  348. ;;
  349. { .mmf
  350. add I = I, I
  351. nop __LINE__
  352. mov f17 = f0
  353. }
  354. { .mmf
  355. adds RPRE1 = RPREFETCH * SIZE, AO1
  356. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  357. mov f19 = f0
  358. }
  359. ;;
  360. { .mmf
  361. adds I = -1, I
  362. nop __LINE__
  363. mov f21 = f0
  364. }
  365. { .mmf
  366. adds RPRE3 = RPREFETCH * SIZE, AO3
  367. adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4
  368. mov f23 = f0
  369. }
  370. ;;
  371. { .mmf
  372. nop __LINE__
  373. nop __LINE__
  374. (p8) FMPY f8 = f40, f32
  375. }
  376. { .mmf
  377. adds RPRE5 = RPREFETCH * SIZE, AO5
  378. adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6
  379. (p8) FMPY f10 = f40, f33
  380. }
  381. ;;
  382. { .mmf
  383. nop __LINE__
  384. nop __LINE__
  385. (p8) FMPY f12 = f40, f34
  386. }
  387. { .mmf
  388. adds RPRE7 = RPREFETCH * SIZE, AO7
  389. adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8
  390. (p8) FMPY f14 = f40, f35
  391. }
  392. ;;
  393. { .mfi
  394. nop __LINE__
  395. (p8) FMPY f16 = f40, f36
  396. mov ar.lc = I
  397. }
  398. { .mmf
  399. adds WPRE = 8 * SIZE, CO
  400. adds PREB = RPREFETCH * SIZE, BO
  401. (p8) FMPY f18 = f40, f37
  402. }
  403. ;;
  404. { .mmf
  405. lfetch.excl.nt1 [WPRE]
  406. nop __LINE__
  407. (p8) FMPY f20 = f40, f38
  408. }
  409. { .mfb
  410. nop __LINE__
  411. (p8) FMPY f22 = f40, f39
  412. (p6) br.cond.dpnt .L15
  413. }
  414. ;;
  415. .align 16
  416. .L12:
  417. { .mfi
  418. (p17) LDFPD f95, f96 = [AO8], 2 * SIZE
  419. (p17) FMA f8 = f104, f33, f8
  420. (p16) tbit.nz.unc p14, p15 = I, 0
  421. }
  422. { .mfi
  423. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  424. (p17) FMA f9 = f105, f34, f9
  425. nop __LINE__
  426. }
  427. ;;
  428. { .mfi
  429. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  430. (p17) FMA f10 = f104, f35, f10
  431. nop __LINE__
  432. }
  433. { .mfi
  434. (p14) PREFETCH [RPRE1], 16 * SIZE
  435. (p17) FMA f11 = f105, f36, f11
  436. nop __LINE__
  437. }
  438. ;;
  439. { .mfi
  440. (p16) LDFPD f34, f35 = [AO2], 2 * SIZE
  441. (p17) FMA f12 = f104, f37, f12
  442. nop __LINE__
  443. }
  444. { .mfi
  445. (p15) PREFETCH [RPRE2], 16 * SIZE
  446. (p17) FMA f13 = f105, f38, f13
  447. nop __LINE__
  448. }
  449. ;;
  450. { .mfi
  451. (p16) LDFPD f36, f37 = [AO3], 2 * SIZE
  452. (p17) FMA f14 = f104, f39, f14
  453. nop __LINE__
  454. }
  455. { .mfi
  456. (p14) PREFETCH [RPRE3], 16 * SIZE
  457. (p17) FMA f15 = f105, f40, f15
  458. nop __LINE__
  459. }
  460. ;;
  461. { .mfi
  462. (p16) LDFPD f38, f39 = [AO4], 2 * SIZE
  463. (p17) FMA f16 = f104, f41, f16
  464. nop __LINE__
  465. }
  466. { .mfi
  467. (p15) PREFETCH [RPRE4], 16 * SIZE
  468. (p17) FMA f17 = f105, f42, f17
  469. nop __LINE__
  470. }
  471. ;;
  472. { .mfi
  473. (p16) LDFPD f40, f41 = [AO5], 2 * SIZE
  474. (p17) FMA f18 = f104, f43, f18
  475. nop __LINE__
  476. }
  477. { .mfi
  478. (p14) PREFETCH [RPRE5], 16 * SIZE
  479. (p17) FMA f19 = f105, f44, f19
  480. nop __LINE__
  481. }
  482. ;;
  483. { .mfi
  484. (p16) LDFPD f42, f43 = [AO6], 2 * SIZE
  485. (p17) FMA f20 = f104, f45, f20
  486. nop __LINE__
  487. }
  488. { .mfi
  489. (p15) PREFETCH [RPRE6], 16 * SIZE
  490. (p17) FMA f21 = f105, f46, f21
  491. nop __LINE__
  492. }
  493. ;;
  494. { .mfi
  495. (p16) LDFPD f44, f45 = [AO7], 2 * SIZE
  496. (p17) FMA f22 = f104, f47, f22
  497. nop __LINE__
  498. }
  499. { .mfi
  500. (p14) PREFETCH [RPRE7], 16 * SIZE
  501. (p17) FMA f23 = f105, f48, f23
  502. nop __LINE__
  503. }
  504. ;;
  505. { .mfi
  506. (p16) LDFPD f46, f47 = [AO8], 2 * SIZE
  507. (p17) FMA f8 = f106, f49, f8
  508. nop __LINE__
  509. }
  510. { .mfi
  511. (p15) PREFETCH [RPRE8], 16 * SIZE
  512. (p17) FMA f9 = f107, f50, f9
  513. nop __LINE__
  514. }
  515. ;;
  516. { .mfi
  517. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  518. (p17) FMA f10 = f106, f51, f10
  519. nop __LINE__
  520. }
  521. { .mfi
  522. (p14) PREFETCH [PREB], 16 * SIZE
  523. (p17) FMA f11 = f107, f52, f11
  524. nop __LINE__
  525. }
  526. ;;
  527. { .mfi
  528. (p16) LDFPD f50, f51 = [AO2], 2 * SIZE
  529. (p17) FMA f12 = f106, f53, f12
  530. nop __LINE__
  531. }
  532. { .mfi
  533. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  534. (p17) FMA f13 = f107, f54, f13
  535. nop __LINE__
  536. }
  537. ;;
  538. { .mfi
  539. (p16) LDFPD f52, f53 = [AO3], 2 * SIZE
  540. (p17) FMA f14 = f106, f55, f14
  541. nop __LINE__
  542. }
  543. { .mfi
  544. nop __LINE__
  545. (p17) FMA f15 = f107, f56, f15
  546. nop __LINE__
  547. }
  548. ;;
  549. { .mfi
  550. (p16) LDFPD f54, f55 = [AO4], 2 * SIZE
  551. (p17) FMA f16 = f106, f57, f16
  552. nop __LINE__
  553. }
  554. { .mfi
  555. nop __LINE__
  556. (p17) FMA f17 = f107, f58, f17
  557. nop __LINE__
  558. }
  559. ;;
  560. { .mfi
  561. (p16) LDFPD f56, f57 = [AO5], 2 * SIZE
  562. (p17) FMA f18 = f106, f59, f18
  563. nop __LINE__
  564. }
  565. { .mfi
  566. nop __LINE__
  567. (p17) FMA f19 = f107, f60, f19
  568. nop __LINE__
  569. }
  570. ;;
  571. { .mfi
  572. (p16) LDFPD f58, f59 = [AO6], 2 * SIZE
  573. (p17) FMA f20 = f106, f61, f20
  574. nop __LINE__
  575. }
  576. { .mfi
  577. nop __LINE__
  578. (p17) FMA f21 = f107, f62, f21
  579. nop __LINE__
  580. }
  581. ;;
  582. { .mfi
  583. (p16) LDFPD f60, f61 = [AO7], 2 * SIZE
  584. (p17) FMA f22 = f106, f63, f22
  585. nop __LINE__
  586. }
  587. { .mfi
  588. nop __LINE__
  589. (p17) FMA f23 = f107, f64, f23
  590. nop __LINE__
  591. }
  592. ;;
  593. { .mfi
  594. (p16) LDFPD f62, f63 = [AO8], 2 * SIZE
  595. (p17) FMA f8 = f108, f65, f8
  596. nop __LINE__
  597. }
  598. { .mfi
  599. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  600. (p17) FMA f9 = f109, f66, f9
  601. nop __LINE__
  602. }
  603. ;;
  604. { .mfi
  605. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  606. (p17) FMA f10 = f108, f67, f10
  607. nop __LINE__
  608. }
  609. { .mfi
  610. nop __LINE__
  611. (p17) FMA f11 = f109, f68, f11
  612. nop __LINE__
  613. }
  614. ;;
  615. { .mfi
  616. (p16) LDFPD f66, f67 = [AO2], 2 * SIZE
  617. (p17) FMA f12 = f108, f69, f12
  618. nop __LINE__
  619. }
  620. { .mfi
  621. nop __LINE__
  622. (p17) FMA f13 = f109, f70, f13
  623. nop __LINE__
  624. }
  625. ;;
  626. { .mfi
  627. (p16) LDFPD f68, f69 = [AO3], 2 * SIZE
  628. (p17) FMA f14 = f108, f71, f14
  629. nop __LINE__
  630. }
  631. { .mfi
  632. nop __LINE__
  633. (p17) FMA f15 = f109, f72, f15
  634. nop __LINE__
  635. }
  636. ;;
  637. { .mfi
  638. (p16) LDFPD f70, f71 = [AO4], 2 * SIZE
  639. (p17) FMA f16 = f108, f73, f16
  640. nop __LINE__
  641. }
  642. { .mfi
  643. nop __LINE__
  644. (p17) FMA f17 = f109, f74, f17
  645. nop __LINE__
  646. }
  647. ;;
  648. { .mfi
  649. (p16) LDFPD f72, f73 = [AO5], 2 * SIZE
  650. (p17) FMA f18 = f108, f75, f18
  651. nop __LINE__
  652. }
  653. { .mfi
  654. nop __LINE__
  655. (p17) FMA f19 = f109, f76, f19
  656. nop __LINE__
  657. }
  658. ;;
  659. { .mfi
  660. (p16) LDFPD f74, f75 = [AO6], 2 * SIZE
  661. (p17) FMA f20 = f108, f77, f20
  662. nop __LINE__
  663. }
  664. { .mfi
  665. nop __LINE__
  666. (p17) FMA f21 = f109, f78, f21
  667. nop __LINE__
  668. }
  669. ;;
  670. { .mfi
  671. (p16) LDFPD f76, f77 = [AO7], 2 * SIZE
  672. (p17) FMA f22 = f108, f79, f22
  673. nop __LINE__
  674. }
  675. { .mfi
  676. nop __LINE__
  677. (p17) FMA f23 = f109, f80, f23
  678. nop __LINE__
  679. }
  680. ;;
  681. { .mfi
  682. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  683. (p17) FMA f8 = f110, f81, f8
  684. nop __LINE__
  685. }
  686. { .mfi
  687. (p16) LDFPD f78, f79 = [AO8], 2 * SIZE
  688. (p17) FMA f9 = f111, f82, f9
  689. nop __LINE__
  690. }
  691. ;;
  692. { .mfi
  693. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  694. (p17) FMA f10 = f110, f83, f10
  695. nop __LINE__
  696. }
  697. { .mfi
  698. nop __LINE__
  699. (p17) FMA f11 = f111, f84, f11
  700. nop __LINE__
  701. }
  702. ;;
  703. { .mfi
  704. (p16) LDFPD f82, f83 = [AO2], 2 * SIZE
  705. (p17) FMA f12 = f110, f85, f12
  706. nop __LINE__
  707. }
  708. { .mfi
  709. nop __LINE__
  710. (p17) FMA f13 = f111, f86, f13
  711. nop __LINE__
  712. }
  713. ;;
  714. { .mfi
  715. (p16) LDFPD f84, f85 = [AO3], 2 * SIZE
  716. (p17) FMA f14 = f110, f87, f14
  717. nop __LINE__
  718. }
  719. { .mfi
  720. nop __LINE__
  721. (p17) FMA f15 = f111, f88, f15
  722. nop __LINE__
  723. }
  724. ;;
  725. { .mfi
  726. (p16) LDFPD f86, f87 = [AO4], 2 * SIZE
  727. (p17) FMA f16 = f110, f89, f16
  728. nop __LINE__
  729. }
  730. { .mfi
  731. nop __LINE__
  732. (p17) FMA f17 = f111, f90, f17
  733. nop __LINE__
  734. }
  735. ;;
  736. { .mfi
  737. (p16) LDFPD f88, f89 = [AO5], 2 * SIZE
  738. (p17) FMA f18 = f110, f91, f18
  739. nop __LINE__
  740. }
  741. { .mfi
  742. nop __LINE__
  743. (p17) FMA f19 = f111, f92, f19
  744. nop __LINE__
  745. }
  746. ;;
  747. { .mfi
  748. (p16) LDFPD f90, f91 = [AO6], 2 * SIZE
  749. (p17) FMA f20 = f110, f93, f20
  750. nop __LINE__
  751. }
  752. { .mfi
  753. nop __LINE__
  754. (p17) FMA f21 = f111, f94, f21
  755. nop __LINE__
  756. }
  757. ;;
  758. { .mfi
  759. (p16) LDFPD f92, f93 = [AO7], 2 * SIZE
  760. (p17) FMA f22 = f110, f95, f22
  761. nop __LINE__
  762. }
  763. { .mfb
  764. adds I = -1, I
  765. (p17) FMA f23 = f111, f96, f23
  766. br.ctop.sptk.few .L12
  767. }
  768. ;;
  769. .align 16
  770. .L15:
  771. and I = 15, MIN_M
  772. mov pr.rot= 0
  773. ;;
  774. cmp.eq p6, p0 = 0, I
  775. cmp.eq p16, p15 = r0, r0
  776. ;;
  777. adds I = 1, I
  778. ;;
  779. shr I = I, 1
  780. ;;
  781. adds I = -1, I
  782. ;;
  783. mov ar.lc = I
  784. mov ar.ec= 3
  785. and I = 15, MIN_M
  786. (p6) br.cond.dpnt .L18
  787. ;;
  /* .L16: remainder loop of the 8-column block (br.ctop counted loop).
   * Stage p16 loads one pair from BO and one pair from each of AO1..AO8.
   * The pair is consumed two rotations later, so a value loaded into fN is
   * read as fN+2: BO f104/f107 -> f106/f109, AO1 f32/f35 -> f34/f37, ...
   * The first element of a pair is accumulated under p18 into the even
   * accumulators f8..f22, the second under p15 into the odd ones f9..f23.
   * p15 is produced at stage p17: I is decremented by 2 and p15 = (I != -1),
   * so the second element is dropped only when the pair is the odd last one.
   * LDFPD/FMA are macros defined outside this section. */
  788. .align 16
  789. .L16:
  790. { .mfi
  791. (p16) LDFPD f104, f107 = [BO], 2 * SIZE
  792. (p18) FMA f8 = f106, f34, f8
  793. nop __LINE__
  794. }
  795. { .mfi
  796. (p16) LDFPD f32, f35 = [AO1], 2 * SIZE
  797. (p15) FMA f9 = f109, f37, f9
  798. nop __LINE__
  799. }
  800. ;;
  801. { .mfi
  802. (p16) LDFPD f38, f41 = [AO2], 2 * SIZE
  803. (p18) FMA f10 = f106, f40, f10
  804. nop __LINE__
  805. }
  806. { .mfi
  807. nop __LINE__
  808. (p15) FMA f11 = f109, f43, f11
  809. nop __LINE__
  810. }
  811. ;;
  812. { .mfi
  813. (p16) LDFPD f44, f47 = [AO3], 2 * SIZE
  814. (p18) FMA f12 = f106, f46, f12
  815. nop __LINE__
  816. }
  817. { .mfi
  818. nop __LINE__
  819. (p15) FMA f13 = f109, f49, f13
  820. nop __LINE__
  821. }
  822. ;;
  823. { .mfi
  824. (p16) LDFPD f50, f53 = [AO4], 2 * SIZE
  825. (p18) FMA f14 = f106, f52, f14
  826. nop __LINE__
  827. }
  828. { .mfi
  829. nop __LINE__
  830. (p15) FMA f15 = f109, f55, f15
  831. nop __LINE__
  832. }
  833. ;;
  834. { .mfi
  835. (p16) LDFPD f56, f59 = [AO5], 2 * SIZE
  836. (p18) FMA f16 = f106, f58, f16
  837. nop __LINE__
  838. }
  839. { .mfi
  840. nop __LINE__
  841. (p15) FMA f17 = f109, f61, f17
  842. nop __LINE__
  843. }
  844. ;;
  845. { .mfi
  846. (p16) LDFPD f62, f65 = [AO6], 2 * SIZE
  847. (p18) FMA f18 = f106, f64, f18
  848. nop __LINE__
  849. }
  /* Stage p17 bookkeeping: two more elements of the remainder accounted for. */
  850. { .mfi
  851. nop __LINE__
  852. (p15) FMA f19 = f109, f67, f19
  853. (p17) adds I = -2, I
  854. }
  855. ;;
  856. { .mfi
  857. (p16) LDFPD f68, f71 = [AO7], 2 * SIZE
  858. (p18) FMA f20 = f106, f70, f20
  859. nop __LINE__
  860. }
  861. { .mfi
  862. nop __LINE__
  863. (p15) FMA f21 = f109, f73, f21
  864. nop __LINE__
  865. }
  866. ;;
  /* The last p15-predicated FMA shares an instruction group with the compare
   * that rewrites p15, so it still reads the value from the previous pass. */
  867. { .mfi
  868. (p16) LDFPD f74, f77 = [AO8], 2 * SIZE
  869. (p15) FMA f23 = f109, f79, f23
  870. (p17) cmp.ne.unc p15, p0 = -1, I
  871. }
  872. { .mfb
  873. nop __LINE__
  874. (p18) FMA f22 = f106, f76, f22
  875. br.ctop.sptk.few .L16
  876. }
  877. ;;
  /* .L18: finish one 8-column block.
   * Each column has two partial sums (even/odd accumulator); they are added
   * pairwise into f8, f10, ..., f22.  Eight values are loaded through CO,
   * which post-increments by INCY, into f32..f39; each is updated as
   * f3x = ALPHA * sum + f3x (FMA is a macro, presumably the IA-64 fma
   * d = a * b + c -- confirm) and stored back through AO1, a copy of CO taken
   * before the loads.  J is decremented and the code loops to .L11 while
   * 0 < J (32-bit compare, cmp4). */
  878. .L18:
  879. { .mmf
  880. mov AO1 = CO
  881. LDFD f32 = [CO], INCY
  882. FADD f8 = f8, f9
  883. }
  884. ;;
  885. { .mmf
  886. LDFD f33 = [CO], INCY
  887. nop __LINE__
  888. FADD f10 = f10, f11
  889. }
  890. ;;
  891. { .mmf
  892. LDFD f34 = [CO], INCY
  893. nop __LINE__
  894. FADD f12 = f12, f13
  895. }
  896. ;;
  897. { .mmf
  898. LDFD f35 = [CO], INCY
  899. nop __LINE__
  900. FADD f14 = f14, f15
  901. }
  902. ;;
  903. { .mmf
  904. LDFD f36 = [CO], INCY
  905. nop __LINE__
  906. FADD f16 = f16, f17
  907. }
  908. ;;
  909. { .mmf
  910. LDFD f37 = [CO], INCY
  911. nop __LINE__
  912. FADD f18 = f18, f19
  913. }
  914. ;;
  915. { .mmf
  916. LDFD f38 = [CO], INCY
  917. nop __LINE__
  918. FADD f20 = f20, f21
  919. }
  920. ;;
  921. { .mmf
  922. LDFD f39 = [CO], INCY
  923. nop __LINE__
  924. FADD f22 = f22, f23
  925. }
  926. ;;
  /* Scale the column sums by ALPHA and add them to the loaded values. */
  927. { .mmf
  928. nop __LINE__
  929. nop __LINE__
  930. FMA f32 = ALPHA, f8, f32
  931. }
  932. { .mmf
  933. nop __LINE__
  934. nop __LINE__
  935. FMA f33 = ALPHA, f10, f33
  936. }
  937. { .mmf
  938. nop __LINE__
  939. nop __LINE__
  940. FMA f34 = ALPHA, f12, f34
  941. }
  942. { .mmf
  943. nop __LINE__
  944. nop __LINE__
  945. FMA f35 = ALPHA, f14, f35
  946. }
  947. ;;
  /* Stores are interleaved with the remaining FMAs; AO1 steps by INCY. */
  948. { .mmf
  949. STFD [AO1] = f32
  950. add AO1 = AO1, INCY
  951. FMA f36 = ALPHA, f16, f36
  952. }
  953. ;;
  954. { .mmf
  955. STFD [AO1] = f33
  956. add AO1 = AO1, INCY
  957. FMA f37 = ALPHA, f18, f37
  958. }
  959. ;;
  960. { .mmf
  961. STFD [AO1] = f34
  962. add AO1 = AO1, INCY
  963. FMA f38 = ALPHA, f20, f38
  964. }
  965. ;;
  966. { .mmf
  967. STFD [AO1] = f35
  968. add AO1 = AO1, INCY
  969. FMA f39 = ALPHA, f22, f39
  970. }
  971. ;;
  972. { .mmi
  973. STFD [AO1] = f36
  974. add AO1 = AO1, INCY
  975. adds J = -1, J
  976. }
  977. ;;
  978. { .mmi
  979. STFD [AO1] = f37
  980. add AO1 = AO1, INCY
  981. nop __LINE__
  982. }
  983. ;;
  984. { .mmi
  985. STFD [AO1] = f38
  986. add AO1 = AO1, INCY
  987. cmp4.lt p6, p0 = 0, J
  988. }
  989. ;;
  990. { .mib
  991. STFD [AO1] = f39
  992. add AO1 = AO1, INCY
  993. (p6) br.cond.dptk .L11
  994. }
  995. ;;
  /* .L20: set up a 4-column block, executed only when bit 2 of N is set
   * (otherwise branch to .L30).
   * AO1..AO4 = A + {0,1,2,3} * LDA, then A is advanced by 4 * LDA.
   * Accumulators f8..f15 are cleared from f0 (reads as 0.0 on IA-64).
   * I = MIN_M >> 4; when it is zero the main loop .L22 is skipped (-> .L25),
   * otherwise ar.lc = 2 * I - 1 with ar.ec = 2.
   * The (p8) instructions load one leading element from each column and from
   * BO and seed f8/f10/f12/f14 with the products; p8 is set outside this
   * section (NOTE(review): presumably an alignment pre-step -- confirm).
   * Note that the (p8) BO load advances BO by 2 * SIZE, the column loads by
   * SIZE. */
  996. .align 16
  997. .L20:
  998. { .mfi
  999. mov AO1 = A
  1000. mov f8 = f0
  1001. mov pr.rot= 0
  1002. }
  1003. { .mfi
  1004. add AO2 = LDA, A
  1005. mov f10 = f0
  1006. tbit.z p6, p0 = N, 2
  1007. }
  1008. ;;
  1009. { .mfi
  1010. shladd AO3 = LDA, 1, A
  1011. mov f12 = f0
  1012. shr I = MIN_M, 4
  1013. }
  1014. { .mfb
  1015. shladd AO4 = LDA, 1, AO2
  1016. mov f14 = f0
  1017. (p6) br.cond.dpnt .L30
  1018. }
  1019. ;;
  1020. { .mmf
  1021. (p8) LDFD f32 = [AO1], SIZE
  1022. (p8) LDFD f33 = [AO2], SIZE
  1023. mov f9 = f0
  1024. }
  1025. { .mmf
  1026. mov BO = BUFFER
  1027. shladd A = LDA, 2, A
  1028. mov f11 = f0
  1029. }
  1030. ;;
  1031. { .mmf
  1032. (p8) LDFD f40 = [BO], 2 * SIZE
  1033. cmp.eq p6, p0 = 0, I
  1034. mov f13 = f0
  1035. }
  1036. { .mmf
  1037. (p8) LDFD f34 = [AO3], SIZE
  1038. (p8) LDFD f35 = [AO4], SIZE
  1039. mov f15 = f0
  1040. }
  1041. ;;
  /* Prefetch pointers run RPREFETCH (or RPREFETCH + 8) elements ahead. */
  1042. { .mmi
  1043. adds RPRE1 = RPREFETCH * SIZE, AO1
  1044. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  1045. mov ar.ec= 2
  1046. }
  1047. { .mmi
  1048. cmp.eq p16, p0 = r0, r0
  1049. add I = I, I
  1050. }
  1051. ;;
  1052. { .mmf
  1053. adds WPRE = 4 * SIZE, CO
  1054. adds PREB = RPREFETCH * SIZE, BO
  1055. (p8) FMPY f8 = f40, f32
  1056. }
  1057. { .mmf
  1058. adds RPRE3 = RPREFETCH * SIZE, AO3
  1059. adds I = -1, I
  1060. (p8) FMPY f10 = f40, f33
  1061. }
  1062. ;;
  1063. { .mfi
  1064. lfetch.excl.nt1 [WPRE]
  1065. (p8) FMPY f12 = f40, f34
  1066. mov ar.lc = I
  1067. }
  1068. { .mfb
  1069. adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4
  1070. (p8) FMPY f14 = f40, f35
  1071. (p6) br.cond.dpnt .L25
  1072. }
  1073. ;;
  /* .L22: main loop of the 4-column block (br.ctop, two stages p16/p17).
   * One pass covers 8 rows: stage p16 issues four pair loads per column
   * (AO1..AO4) and the BO pair loads; stage p17 of the following pass does
   * the 32 FMAs, reading every loaded register one name higher because of
   * register rotation (f32 -> f33, f103 -> f104, ...).
   * The last AO4 pair and the last BO pair of a pass are not loaded at p16;
   * they are loaded at the top of the next pass under p17 directly into
   * f87/f88 and f110/f111.
   * Even accumulators (f8, f10, f12, f14) take the first element of each
   * pair, odd ones (f9, f11, f13, f15) the second.
   * p14/p15 alternate on bit 0 of the down-counter I and select which
   * prefetch pointers are advanced in a given pass. */
  1074. .align 16
  1075. .L22:
  1076. { .mmf
  1077. (p17) LDFPD f87, f88 = [AO4], 2 * SIZE
  1078. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  1079. (p17) FMA f8 = f104, f33, f8
  1080. }
  1081. { .mfi
  1082. nop __LINE__
  1083. (p17) FMA f9 = f105, f34, f9
  1084. (p16) tbit.nz.unc p14, p15 = I, 0
  1085. }
  1086. ;;
  1087. { .mmf
  1088. (p14) PREFETCH [RPRE1], 16 * SIZE
  1089. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  1090. (p17) FMA f10 = f104, f35, f10
  1091. }
  1092. { .mmf
  1093. nop __LINE__
  1094. nop __LINE__
  1095. (p17) FMA f11 = f105, f36, f11
  1096. }
  1097. ;;
  1098. { .mmf
  1099. (p15) PREFETCH [RPRE2], 16 * SIZE
  1100. (p16) LDFPD f34, f35 = [AO2], 2 * SIZE
  1101. (p17) FMA f12 = f104, f37, f12
  1102. }
  1103. { .mmf
  1104. nop __LINE__
  1105. nop __LINE__
  1106. (p17) FMA f13 = f105, f38, f13
  1107. }
  1108. ;;
  1109. { .mmf
  1110. (p14) PREFETCH [RPRE3], 16 * SIZE
  1111. (p16) LDFPD f36, f37 = [AO3], 2 * SIZE
  1112. (p17) FMA f14 = f104, f39, f14
  1113. }
  1114. { .mmf
  1115. nop __LINE__
  1116. nop __LINE__
  1117. (p17) FMA f15 = f105, f40, f15
  1118. }
  1119. ;;
  /* Second pair of the pass (BO values f106/f107). */
  1120. { .mmf
  1121. (p15) PREFETCH [RPRE4], 16 * SIZE
  1122. (p16) LDFPD f38, f39 = [AO4], 2 * SIZE
  1123. (p17) FMA f8 = f106, f49, f8
  1124. }
  1125. { .mmf
  1126. nop __LINE__
  1127. nop __LINE__
  1128. (p17) FMA f9 = f107, f50, f9
  1129. }
  1130. ;;
  1131. { .mmf
  1132. (p14) PREFETCH [PREB], 16 * SIZE
  1133. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  1134. (p17) FMA f10 = f106, f51, f10
  1135. }
  1136. { .mmf
  1137. nop __LINE__
  1138. nop __LINE__
  1139. (p17) FMA f11 = f107, f52, f11
  1140. }
  1141. ;;
  1142. { .mmf
  1143. (p16) LDFPD f50, f51 = [AO2], 2 * SIZE
  1144. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  1145. (p17) FMA f12 = f106, f53, f12
  1146. }
  1147. { .mmf
  1148. nop __LINE__
  1149. nop __LINE__
  1150. (p17) FMA f13 = f107, f54, f13
  1151. }
  1152. ;;
  1153. { .mmf
  1154. (p16) LDFPD f52, f53 = [AO3], 2 * SIZE
  1155. nop __LINE__
  1156. (p17) FMA f14 = f106, f55, f14
  1157. }
  1158. { .mmf
  1159. nop __LINE__
  1160. nop __LINE__
  1161. (p17) FMA f15 = f107, f56, f15
  1162. }
  1163. ;;
  /* Third pair of the pass (BO values f108/f109). */
  1164. { .mmf
  1165. (p16) LDFPD f54, f55 = [AO4], 2 * SIZE
  1166. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  1167. (p17) FMA f8 = f108, f65, f8
  1168. }
  1169. { .mmf
  1170. nop __LINE__
  1171. nop __LINE__
  1172. (p17) FMA f9 = f109, f66, f9
  1173. }
  1174. ;;
  1175. { .mmf
  1176. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  1177. nop __LINE__
  1178. (p17) FMA f10 = f108, f67, f10
  1179. }
  1180. { .mmf
  1181. nop __LINE__
  1182. nop __LINE__
  1183. (p17) FMA f11 = f109, f68, f11
  1184. }
  1185. ;;
  1186. { .mmf
  1187. (p16) LDFPD f66, f67 = [AO2], 2 * SIZE
  1188. nop __LINE__
  1189. (p17) FMA f12 = f108, f69, f12
  1190. }
  1191. { .mmf
  1192. nop __LINE__
  1193. nop __LINE__
  1194. (p17) FMA f13 = f109, f70, f13
  1195. }
  1196. ;;
  1197. { .mmf
  1198. (p16) LDFPD f68, f69 = [AO3], 2 * SIZE
  1199. nop __LINE__
  1200. (p17) FMA f14 = f108, f71, f14
  1201. }
  1202. { .mmf
  1203. nop __LINE__
  1204. nop __LINE__
  1205. (p17) FMA f15 = f109, f72, f15
  1206. }
  1207. ;;
  /* Fourth pair of the pass (BO values f110/f111 loaded at the loop top). */
  1208. { .mmf
  1209. (p16) LDFPD f70, f71 = [AO4], 2 * SIZE
  1210. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  1211. (p17) FMA f8 = f110, f81, f8
  1212. }
  1213. { .mmf
  1214. nop __LINE__
  1215. nop __LINE__
  1216. (p17) FMA f9 = f111, f82, f9
  1217. }
  1218. ;;
  1219. { .mmf
  1220. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  1221. nop __LINE__
  1222. (p17) FMA f10 = f110, f83, f10
  1223. }
  1224. { .mmf
  1225. nop __LINE__
  1226. nop __LINE__
  1227. (p17) FMA f11 = f111, f84, f11
  1228. }
  1229. ;;
  1230. { .mmf
  1231. (p16) LDFPD f82, f83 = [AO2], 2 * SIZE
  1232. nop __LINE__
  1233. (p17) FMA f12 = f110, f85, f12
  1234. }
  1235. { .mmf
  1236. nop __LINE__
  1237. nop __LINE__
  1238. (p17) FMA f13 = f111, f86, f13
  1239. }
  1240. ;;
  1241. { .mmf
  1242. (p16) LDFPD f84, f85 = [AO3], 2 * SIZE
  1243. nop __LINE__
  1244. (p17) FMA f14 = f110, f87, f14
  1245. }
  /* I only feeds the p14/p15 toggle; the trip count itself is in ar.lc. */
  1246. { .mfb
  1247. adds I = -1, I
  1248. (p17) FMA f15 = f111, f88, f15
  1249. br.ctop.sptk.few .L22
  1250. }
  1251. ;;
  /* .L25: set up the remainder loop (.L26) of the 4-column block.
   * I = MIN_M & 15 rows are left; if none (p6) go to .L28.
   * ar.lc = ((I + 1) >> 1) - 1 (one pair per pass), ar.ec = 3,
   * p16 = 1, p15 = 0, and I is reloaded with the raw remainder so .L26
   * can count it down. */
  1252. .align 16
  1253. .L25:
  1254. and I = 15, MIN_M
  1255. mov pr.rot= 0
  1256. ;;
  1257. cmp.eq p6, p0 = 0, I
  1258. cmp.eq p16, p15 = r0, r0
  1259. ;;
  1260. adds I = 1, I
  1261. ;;
  1262. shr I = I, 1
  1263. ;;
  1264. adds I = -1, I
  1265. ;;
  1266. mov ar.lc = I
  1267. mov ar.ec= 3
  1268. and I = 15, MIN_M
  1269. (p6) br.cond.dpnt .L28
  1270. ;;
  /* .L26: remainder loop of the 4-column block (br.ctop, stages p16..p18).
   * Stage p16 loads one pair from BO and from AO1..AO4; the values are read
   * two rotations later (f104/f107 -> f106/f109, f32/f35 -> f34/f37, ...).
   * The first element of the pair is accumulated under p18 (f8, f10, f12,
   * f14), the second under p15 (f9, f11, f13, f15).
   * At stage p17, I -= 2 and p15 = (I != -1): the second element is skipped
   * only for the odd final pair. */
  1271. .align 16
  1272. .L26:
  1273. { .mmf
  1274. (p16) LDFPD f104, f107 = [BO], 2 * SIZE
  1275. (p16) LDFPD f32, f35 = [AO1], 2 * SIZE
  1276. (p18) FMA f8 = f106, f34, f8
  1277. }
  1278. { .mmf
  1279. nop __LINE__
  1280. nop __LINE__
  1281. (p15) FMA f9 = f109, f37, f9
  1282. }
  1283. ;;
  1284. { .mmf
  1285. (p16) LDFPD f38, f41 = [AO2], 2 * SIZE
  1286. nop __LINE__
  1287. (p18) FMA f10 = f106, f40, f10
  1288. }
  1289. { .mmf
  1290. nop __LINE__
  1291. nop __LINE__
  1292. (p15) FMA f11 = f109, f43, f11
  1293. }
  1294. ;;
  1295. { .mmf
  1296. (p16) LDFPD f44, f47 = [AO3], 2 * SIZE
  1297. nop __LINE__
  1298. (p18) FMA f12 = f106, f46, f12
  1299. }
  1300. { .mmf
  1301. nop __LINE__
  1302. (p17) adds I = -2, I
  1303. (p15) FMA f13 = f109, f49, f13
  1304. }
  1305. ;;
  /* The final p15 FMA sits in the same instruction group as the compare that
   * rewrites p15, so it still sees the previous pass's value. */
  1306. { .mmf
  1307. (p16) LDFPD f50, f53 = [AO4], 2 * SIZE
  1308. nop __LINE__
  1309. (p15) FMA f15 = f109, f55, f15
  1310. }
  1311. { .mfb
  1312. (p17) cmp.ne.unc p15, p0 = -1, I
  1313. (p18) FMA f14 = f106, f52, f14
  1314. br.ctop.sptk.few .L26
  1315. }
  1316. ;;
  /* .L28: finish the 4-column block.
   * Adds the even/odd partial sums of each column (f8+f9, f10+f11, f12+f13,
   * f14+f15), loads four values through CO (post-incremented by INCY),
   * updates each as f3x = ALPHA * sum + f3x (FMA macro, presumably IA-64
   * fma -- confirm) and stores them back through AO1, which starts as a copy
   * of CO.  Falls through to .L30. */
  1317. .L28:
  1318. { .mmf
  1319. mov AO1 = CO
  1320. LDFD f32 = [CO], INCY
  1321. FADD f8 = f8, f9
  1322. }
  1323. ;;
  1324. { .mmf
  1325. LDFD f33 = [CO], INCY
  1326. nop __LINE__
  1327. FADD f10 = f10, f11
  1328. }
  1329. ;;
  1330. { .mmf
  1331. LDFD f34 = [CO], INCY
  1332. nop __LINE__
  1333. FADD f12 = f12, f13
  1334. }
  1335. ;;
  1336. { .mmf
  1337. LDFD f35 = [CO], INCY
  1338. nop __LINE__
  1339. FADD f14 = f14, f15
  1340. }
  1341. ;;
  1342. { .mmf
  1343. nop __LINE__
  1344. nop __LINE__
  1345. FMA f32 = ALPHA, f8, f32
  1346. }
  1347. { .mmf
  1348. nop __LINE__
  1349. nop __LINE__
  1350. FMA f33 = ALPHA, f10, f33
  1351. }
  1352. { .mmf
  1353. nop __LINE__
  1354. nop __LINE__
  1355. FMA f34 = ALPHA, f12, f34
  1356. }
  1357. { .mmf
  1358. nop __LINE__
  1359. nop __LINE__
  1360. FMA f35 = ALPHA, f14, f35
  1361. }
  1362. ;;
  /* Write the four updated values back, stepping AO1 by INCY. */
  1363. { .mmf
  1364. STFD [AO1] = f32
  1365. add AO1 = AO1, INCY
  1366. }
  1367. ;;
  1368. { .mmf
  1369. STFD [AO1] = f33
  1370. add AO1 = AO1, INCY
  1371. }
  1372. ;;
  1373. { .mmf
  1374. STFD [AO1] = f34
  1375. add AO1 = AO1, INCY
  1376. }
  1377. ;;
  1378. { .mmf
  1379. STFD [AO1] = f35
  1380. add AO1 = AO1, INCY
  1381. }
  1382. ;;
  /* .L30: set up a 2-column block, executed only when bit 1 of N is set
   * (otherwise branch to .L40).
   * AO1 = A, AO2 = A + LDA, then A is advanced by 2 * LDA; BO restarts at
   * BUFFER.  f8..f15 are cleared from f0 (reads as 0.0 on IA-64).
   * I = MIN_M >> 4; zero skips the main loop (-> .L35), otherwise
   * ar.lc = 2 * I - 1 with ar.ec = 2.
   * The (p8) instructions load one leading element per column and from BO
   * and seed f8/f10 with the products; p8 is set outside this section
   * (NOTE(review): presumably an alignment pre-step -- confirm). */
  1383. .align 16
  1384. .L30:
  1385. { .mfi
  1386. mov AO1 = A
  1387. mov f8 = f0
  1388. mov pr.rot= 0
  1389. }
  1390. { .mfi
  1391. add AO2 = LDA, A
  1392. mov f10 = f0
  1393. tbit.z p6, p0 = N, 1
  1394. }
  1395. ;;
  1396. { .mfi
  1397. mov BO = BUFFER
  1398. mov f12 = f0
  1399. shr I = MIN_M, 4
  1400. }
  1401. { .mfb
  1402. adds WPRE = 4 * SIZE, CO
  1403. mov f14 = f0
  1404. (p6) br.cond.dpnt .L40
  1405. }
  1406. ;;
  1407. { .mmf
  1408. (p8) LDFD f32 = [AO1], SIZE
  1409. (p8) LDFD f33 = [AO2], SIZE
  1410. mov f9 = f0
  1411. }
  1412. { .mfi
  1413. shladd A = LDA, 1, A
  1414. mov f11 = f0
  1415. mov ar.ec= 2
  1416. }
  1417. ;;
  1418. { .mmf
  1419. (p8) LDFD f40 = [BO], 2 * SIZE
  1420. cmp.eq p6, p0 = 0, I
  1421. mov f13 = f0
  1422. }
  1423. { .mmf
  1424. adds RPRE1 = RPREFETCH * SIZE, AO1
  1425. add I = I, I
  1426. mov f15 = f0
  1427. }
  1428. ;;
  1429. { .mmi
  1430. cmp.eq p16, p0 = r0, r0
  1431. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  1432. adds I = -1, I
  1433. }
  1434. ;;
  1435. { .mfi
  1436. lfetch.excl.nt1 [WPRE]
  1437. (p8) FMPY f8 = f40, f32
  1438. mov ar.lc = I
  1439. }
  1440. { .mfb
  1441. adds PREB = RPREFETCH * SIZE, BO
  1442. (p8) FMPY f10 = f40, f33
  1443. (p6) br.cond.dpnt .L35
  1444. }
  1445. ;;
  /* .L32: main loop of the 2-column block (br.ctop, two stages p16/p17).
   * One pass covers 8 rows.  Stage p16 loads pairs from AO1, AO2 and BO;
   * stage p17 of the next pass multiplies them, reading each register one
   * name higher because of rotation (f32 -> f33, f103 -> f104, ...).
   * The last AO2 pair and the last BO pair of a pass are loaded at the top
   * of the next pass under p17 straight into f83/f84 and f110/f111.
   * Column 1 accumulates into f8/f9, column 2 into f10/f11.
   * p14/p15 alternate on bit 0 of I and gate the prefetches. */
  1446. .align 16
  1447. .L32:
  1448. { .mmf
  1449. (p17) LDFPD f83, f84 = [AO2], 2 * SIZE
  1450. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  1451. (p17) FMA f8 = f104, f33, f8
  1452. }
  1453. { .mfi
  1454. nop __LINE__
  1455. (p17) FMA f9 = f105, f34, f9
  1456. (p16) tbit.nz.unc p14, p15 = I, 0
  1457. }
  1458. ;;
  1459. { .mmf
  1460. (p14) PREFETCH [RPRE1], 16 * SIZE
  1461. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  1462. (p17) FMA f10 = f104, f35, f10
  1463. }
  1464. { .mmf
  1465. nop __LINE__
  1466. nop __LINE__
  1467. (p17) FMA f11 = f105, f36, f11
  1468. }
  1469. ;;
  1470. { .mmf
  1471. (p15) PREFETCH [RPRE2], 16 * SIZE
  1472. (p16) LDFPD f34, f35 = [AO2], 2 * SIZE
  1473. (p17) FMA f8 = f106, f49, f8
  1474. }
  1475. { .mmf
  1476. nop __LINE__
  1477. nop __LINE__
  1478. (p17) FMA f9 = f107, f50, f9
  1479. }
  1480. ;;
  1481. { .mmf
  1482. (p14) PREFETCH [PREB], 16 * SIZE
  1483. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  1484. (p17) FMA f10 = f106, f51, f10
  1485. }
  1486. { .mmf
  1487. nop __LINE__
  1488. nop __LINE__
  1489. (p17) FMA f11 = f107, f52, f11
  1490. }
  1491. ;;
  1492. { .mmf
  1493. (p16) LDFPD f50, f51 = [AO2], 2 * SIZE
  1494. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  1495. (p17) FMA f8 = f108, f65, f8
  1496. }
  1497. { .mmf
  1498. nop __LINE__
  1499. nop __LINE__
  1500. (p17) FMA f9 = f109, f66, f9
  1501. }
  1502. ;;
  1503. { .mmf
  1504. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  1505. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  1506. (p17) FMA f10 = f108, f67, f10
  1507. }
  1508. { .mmf
  1509. nop __LINE__
  1510. nop __LINE__
  1511. (p17) FMA f11 = f109, f68, f11
  1512. }
  1513. ;;
  1514. { .mmf
  1515. (p16) LDFPD f66, f67 = [AO2], 2 * SIZE
  1516. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  1517. (p17) FMA f8 = f110, f81, f8
  1518. }
  1519. { .mmf
  1520. nop __LINE__
  1521. nop __LINE__
  1522. (p17) FMA f9 = f111, f82, f9
  1523. }
  1524. ;;
  1525. { .mmf
  1526. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  1527. nop __LINE__
  1528. (p17) FMA f10 = f110, f83, f10
  1529. }
  /* I only feeds the p14/p15 toggle; the trip count itself is in ar.lc. */
  1530. { .mfb
  1531. adds I = -1, I
  1532. (p17) FMA f11 = f111, f84, f11
  1533. br.ctop.sptk.few .L32
  1534. }
  1535. ;;
  /* .L35: straight-line remainder of the 2-column block.
   * If MIN_M & 15 is zero, skip to .L38.  Otherwise bits 3..0 of MIN_M set
   * p12 (8 rows), p13 (4 rows), p14 (2 rows) and p15 (1 row); the loads and
   * FMAs below are predicated accordingly.
   * Column 1 (AO1) accumulates into f8, f9, f12, f13 and column 2 (AO2) into
   * f10, f11, f14, f15; .L38 adds each group together. */
  1536. .align 16
  1537. .L35:
  1538. and I = 15, MIN_M
  1539. ;;
  1540. cmp.eq p6, p0 = 0, I
  1541. (p6) br.cond.dpnt .L38
  1542. ;;
  1543. tbit.nz p12, p0 = MIN_M, 3
  1544. tbit.nz p13, p0 = MIN_M, 2
  1545. tbit.nz p14, p0 = MIN_M, 1
  1546. tbit.nz p15, p0 = MIN_M, 0
  1547. ;;
  /* p12: eight elements per column and from BO. */
  1548. (p12) LDFPD f32, f33 = [AO1], 2 * SIZE
  1549. (p12) LDFPD f34, f35 = [AO2], 2 * SIZE
  1550. (p12) LDFPD f100, f101 = [BO], 2 * SIZE
  1551. ;;
  1552. (p12) LDFPD f36, f37 = [AO1], 2 * SIZE
  1553. (p12) LDFPD f38, f39 = [AO2], 2 * SIZE
  1554. (p12) LDFPD f102, f103 = [BO], 2 * SIZE
  1555. ;;
  1556. (p12) LDFPD f40, f41 = [AO1], 2 * SIZE
  1557. (p12) LDFPD f42, f43 = [AO2], 2 * SIZE
  1558. (p12) LDFPD f104, f105 = [BO], 2 * SIZE
  1559. ;;
  1560. (p12) LDFPD f44, f45 = [AO1], 2 * SIZE
  1561. (p12) LDFPD f46, f47 = [AO2], 2 * SIZE
  1562. (p12) LDFPD f106, f107 = [BO], 2 * SIZE
  1563. ;;
  /* p13: four elements. */
  1564. (p13) LDFPD f48, f49 = [AO1], 2 * SIZE
  1565. (p13) LDFPD f50, f51 = [AO2], 2 * SIZE
  1566. (p13) LDFPD f108, f109 = [BO], 2 * SIZE
  1567. ;;
  1568. (p13) LDFPD f52, f53 = [AO1], 2 * SIZE
  1569. (p13) LDFPD f54, f55 = [AO2], 2 * SIZE
  1570. (p13) LDFPD f110, f111 = [BO], 2 * SIZE
  1571. ;;
  /* p14: two elements; p15: a single element (plain LDFD, no increment). */
  1572. (p14) LDFPD f56, f57 = [AO1], 2 * SIZE
  1573. (p14) LDFPD f58, f59 = [AO2], 2 * SIZE
  1574. (p14) LDFPD f112, f113 = [BO], 2 * SIZE
  1575. ;;
  1576. (p15) LDFD f60 = [AO1]
  1577. (p15) LDFD f61 = [AO2]
  1578. (p15) LDFD f114 = [BO]
  1579. ;;
  /* Multiply-accumulate; each group of four FMAs writes distinct registers. */
  1580. (p12) FMA f8 = f100, f32, f8
  1581. (p12) FMA f9 = f101, f33, f9
  1582. (p12) FMA f10 = f100, f34, f10
  1583. (p12) FMA f11 = f101, f35, f11
  1584. ;;
  1585. (p12) FMA f12 = f102, f36, f12
  1586. (p12) FMA f13 = f103, f37, f13
  1587. (p12) FMA f14 = f102, f38, f14
  1588. (p12) FMA f15 = f103, f39, f15
  1589. ;;
  1590. (p12) FMA f8 = f104, f40, f8
  1591. (p12) FMA f9 = f105, f41, f9
  1592. (p12) FMA f10 = f104, f42, f10
  1593. (p12) FMA f11 = f105, f43, f11
  1594. ;;
  1595. (p12) FMA f12 = f106, f44, f12
  1596. (p12) FMA f13 = f107, f45, f13
  1597. (p12) FMA f14 = f106, f46, f14
  1598. (p12) FMA f15 = f107, f47, f15
  1599. ;;
  1600. (p13) FMA f8 = f108, f48, f8
  1601. (p13) FMA f9 = f109, f49, f9
  1602. (p13) FMA f10 = f108, f50, f10
  1603. (p13) FMA f11 = f109, f51, f11
  1604. ;;
  1605. (p13) FMA f12 = f110, f52, f12
  1606. (p13) FMA f13 = f111, f53, f13
  1607. (p13) FMA f14 = f110, f54, f14
  1608. (p13) FMA f15 = f111, f55, f15
  1609. ;;
  1610. (p14) FMA f8 = f112, f56, f8
  1611. (p14) FMA f9 = f113, f57, f9
  1612. (p14) FMA f10 = f112, f58, f10
  1613. (p14) FMA f11 = f113, f59, f11
  1614. ;;
  1615. (p15) FMA f12 = f114, f60, f12
  1616. (p15) FMA f14 = f114, f61, f14
  1617. ;;
  /* .L38: finish the 2-column block.
   * Column 1 total = (f8 + f9) + (f12 + f13) -> f8,
   * column 2 total = (f10 + f11) + (f14 + f15) -> f10.
   * Two values are loaded through CO (post-incremented by INCY), updated as
   * f3x = ALPHA * total + f3x (FMA macro, presumably IA-64 fma -- confirm)
   * and stored through AO1, a copy of CO taken before the loads.
   * Falls through to .L40. */
  1618. .L38:
  1619. FADD f8 = f8, f9
  1620. FADD f10 = f10, f11
  1621. FADD f12 = f12, f13
  1622. FADD f14 = f14, f15
  1623. ;;
  1624. FADD f8 = f8, f12
  1625. FADD f10 = f10, f14
  1626. ;;
  1627. { .mmf
  1628. mov AO1 = CO
  1629. LDFD f32 = [CO], INCY
  1630. }
  1631. ;;
  1632. { .mmf
  1633. LDFD f33 = [CO], INCY
  1634. nop __LINE__
  1635. }
  1636. ;;
  1637. { .mmf
  1638. nop __LINE__
  1639. nop __LINE__
  1640. FMA f32 = ALPHA, f8, f32
  1641. }
  1642. { .mmf
  1643. nop __LINE__
  1644. nop __LINE__
  1645. FMA f33 = ALPHA, f10, f33
  1646. }
  1647. ;;
  1648. { .mmf
  1649. STFD [AO1] = f32
  1650. add AO1 = AO1, INCY
  1651. }
  1652. ;;
  1653. { .mmf
  1654. STFD [AO1] = f33
  1655. }
  1656. ;;
  /* .L40: set up the single-column block, executed only when bit 0 of N is
   * set (otherwise branch to .L99).
   * AO1 = A, then A is advanced by LDA; BO restarts at BUFFER.
   * f8..f15 are cleared from f0 (reads as 0.0 on IA-64).
   * I = MIN_M >> 4; zero skips the main loop (-> .L45), otherwise
   * ar.lc = 2 * I - 1 with ar.ec = 2.
   * The (p8) instructions load one leading element from AO1 and BO and seed
   * f8 with the product; p8 is set outside this section
   * (NOTE(review): presumably an alignment pre-step -- confirm). */
  1657. .align 16
  1658. .L40:
  1659. { .mfi
  1660. mov AO1 = A
  1661. mov f8 = f0
  1662. shr I = MIN_M, 4
  1663. }
  1664. { .mfi
  1665. mov BO = BUFFER
  1666. mov f10 = f0
  1667. tbit.z p7, p0 = N, 0
  1668. }
  1669. ;;
  1670. { .mfi
  1671. cmp.eq p6, p0 = 0, I
  1672. mov f12 = f0
  1673. mov pr.rot= 0
  1674. }
  1675. { .mfb
  1676. add I = I, I
  1677. mov f14 = f0
  1678. (p7) br.cond.dpnt .L99
  1679. }
  1680. ;;
  1681. { .mfi
  1682. (p8) LDFD f32 = [AO1], SIZE
  1683. mov f9 = f0
  1684. mov ar.ec= 2
  1685. }
  1686. { .mmf
  1687. (p8) LDFD f40 = [BO], 2 * SIZE
  1688. add A = A, LDA
  1689. mov f11 = f0
  1690. }
  1691. ;;
  /* NOTE(review): PREB is computed here but no instruction in .L42/.L45/.L48
   * reads it. */
  1692. { .mmf
  1693. adds WPRE = 1 * SIZE, CO
  1694. adds PREB = RPREFETCH * SIZE, BO
  1695. mov f13 = f0
  1696. }
  1697. { .mmf
  1698. cmp.eq p16, p0 = r0, r0
  1699. adds I = -1, I
  1700. mov f15 = f0
  1701. }
  1702. ;;
  1703. { .mfi
  1704. lfetch.excl.nt1 [WPRE]
  1705. (p8) FMPY f8 = f40, f32
  1706. mov ar.lc = I
  1707. }
  1708. { .mmb
  1709. nop __LINE__
  1710. nop __LINE__
  1711. (p6) br.cond.dpnt .L45
  1712. }
  1713. ;;
  /* .L42: main loop of the single-column block (br.ctop, stages p16/p17).
   * One pass covers 8 rows.  Stage p16 loads three pairs from AO1 and three
   * from BO; the fourth pair of each is loaded at the top of the next pass
   * under p17 straight into f81/f82 and f110/f111.  Stage p17 performs the
   * FMAs, reading each loaded register one name higher (rotation).
   * First elements of each pair accumulate into f8, second elements into f9.
   * NOTE(review): p14/p15 are computed from I below, but nothing in this
   * loop is predicated on them (no PREFETCH here) -- possibly a leftover. */
  1714. .align 16
  1715. .L42:
  1716. { .mmf
  1717. (p17) LDFPD f81, f82 = [AO1], 2 * SIZE
  1718. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  1719. (p17) FMA f8 = f104, f33, f8
  1720. }
  1721. { .mfi
  1722. nop __LINE__
  1723. (p17) FMA f9 = f105, f34, f9
  1724. (p16) tbit.nz.unc p14, p15 = I, 0
  1725. }
  1726. ;;
  1727. { .mmf
  1728. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  1729. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  1730. (p17) FMA f8 = f106, f49, f8
  1731. }
  1732. { .mmf
  1733. nop __LINE__
  1734. nop __LINE__
  1735. (p17) FMA f9 = f107, f50, f9
  1736. }
  1737. ;;
  1738. { .mmf
  1739. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  1740. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  1741. (p17) FMA f8 = f108, f65, f8
  1742. }
  1743. { .mmf
  1744. nop __LINE__
  1745. nop __LINE__
  1746. (p17) FMA f9 = f109, f66, f9
  1747. }
  1748. ;;
  1749. { .mmf
  1750. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  1751. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  1752. (p17) FMA f8 = f110, f81, f8
  1753. }
  1754. { .mfb
  1755. adds I = -1, I
  1756. (p17) FMA f9 = f111, f82, f9
  1757. br.ctop.sptk.few .L42
  1758. }
  1759. ;;
  /* .L45: straight-line remainder of the single-column block.
   * If MIN_M & 15 is zero, skip to .L48.  Otherwise bits 3..0 of MIN_M set
   * p12 (8 rows), p13 (4 rows), p14 (2 rows) and p15 (1 row).
   * All eight accumulators f8..f15 are used as independent partial sums of
   * the same column; .L48 adds them all together. */
  1760. .align 16
  1761. .L45:
  1762. and I = 15, MIN_M
  1763. ;;
  1764. cmp.eq p6, p0 = 0, I
  1765. (p6) br.cond.dpnt .L48
  1766. ;;
  1767. tbit.nz p12, p0 = MIN_M, 3
  1768. tbit.nz p13, p0 = MIN_M, 2
  1769. tbit.nz p14, p0 = MIN_M, 1
  1770. tbit.nz p15, p0 = MIN_M, 0
  1771. ;;
  1772. (p12) LDFPD f32, f33 = [AO1], 2 * SIZE
  1773. (p12) LDFPD f100, f101 = [BO], 2 * SIZE
  1774. ;;
  1775. (p12) LDFPD f36, f37 = [AO1], 2 * SIZE
  1776. (p12) LDFPD f102, f103 = [BO], 2 * SIZE
  1777. ;;
  1778. (p12) LDFPD f40, f41 = [AO1], 2 * SIZE
  1779. (p12) LDFPD f104, f105 = [BO], 2 * SIZE
  1780. ;;
  1781. (p12) LDFPD f44, f45 = [AO1], 2 * SIZE
  1782. (p12) LDFPD f106, f107 = [BO], 2 * SIZE
  1783. ;;
  1784. (p13) LDFPD f48, f49 = [AO1], 2 * SIZE
  1785. (p13) LDFPD f108, f109 = [BO], 2 * SIZE
  1786. ;;
  1787. (p13) LDFPD f52, f53 = [AO1], 2 * SIZE
  1788. (p13) LDFPD f110, f111 = [BO], 2 * SIZE
  1789. ;;
  1790. (p14) LDFPD f56, f57 = [AO1], 2 * SIZE
  1791. (p14) LDFPD f112, f113 = [BO], 2 * SIZE
  1792. ;;
  1793. (p15) LDFD f60 = [AO1]
  1794. (p15) LDFD f114 = [BO]
  1795. ;;
  /* Each instruction group below writes every accumulator at most once. */
  1796. (p12) FMA f8 = f100, f32, f8
  1797. (p12) FMA f9 = f101, f33, f9
  1798. (p12) FMA f10 = f102, f36, f10
  1799. (p12) FMA f11 = f103, f37, f11
  1800. (p12) FMA f12 = f104, f40, f12
  1801. (p12) FMA f13 = f105, f41, f13
  1802. (p12) FMA f14 = f106, f44, f14
  1803. (p12) FMA f15 = f107, f45, f15
  1804. ;;
  1805. (p13) FMA f8 = f108, f48, f8
  1806. (p13) FMA f9 = f109, f49, f9
  1807. (p13) FMA f10 = f110, f52, f10
  1808. (p13) FMA f11 = f111, f53, f11
  1809. (p14) FMA f12 = f112, f56, f12
  1810. (p14) FMA f13 = f113, f57, f13
  1811. (p15) FMA f14 = f114, f60, f14
  1812. ;;
  /* .L48: finish the single-column block.
   * Reduces the eight partial sums f8..f15 to f8 with a three-level tree of
   * FADDs, loads one value from [CO] (no post-increment), updates it as
   * f32 = ALPHA * f8 + f32 (FMA macro, presumably IA-64 fma -- confirm) and
   * stores it back to [CO].  Falls through to .L99. */
  1813. .L48:
  1814. { .mmf
  1815. LDFD f32 = [CO]
  1816. nop __LINE__
  1817. FADD f8 = f8, f9
  1818. }
  1819. { .mmf
  1820. nop __LINE__
  1821. nop __LINE__
  1822. FADD f10 = f10, f11
  1823. }
  1824. ;;
  1825. { .mmf
  1826. nop __LINE__
  1827. nop __LINE__
  1828. FADD f12 = f12, f13
  1829. }
  1830. { .mmf
  1831. nop __LINE__
  1832. nop __LINE__
  1833. FADD f14 = f14, f15
  1834. }
  1835. ;;
  1836. { .mmf
  1837. nop __LINE__
  1838. nop __LINE__
  1839. FADD f8 = f8, f12
  1840. }
  1841. { .mmf
  1842. nop __LINE__
  1843. nop __LINE__
  1844. FADD f10 = f10, f14
  1845. }
  1846. ;;
  1847. { .mmf
  1848. nop __LINE__
  1849. nop __LINE__
  1850. FADD f8 = f8, f10
  1851. }
  1852. ;;
  1853. { .mmf
  1854. nop __LINE__
  1855. nop __LINE__
  1856. FMA f32 = ALPHA, f8, f32
  1857. }
  1858. ;;
  1859. { .mmf
  1860. STFD [CO] = f32
  1861. }
  1862. ;;
  /* .L99: outer-loop control.
   * IS advances by P and A by (LDAP << BASE_SHIFT); while M > IS (signed
   * compare) the code branches back to .LIs_loop, otherwise it jumps to
   * .L999.  .LIs_loop, .L999 and the definitions of P, LDAP and BASE_SHIFT
   * are outside this section. */
  1863. .align 16
  1864. .L99:
  1865. adds IS = P, IS
  1866. shladd A = LDAP, BASE_SHIFT, A
  1867. ;;
  1868. cmp.gt p6, p0 = M, IS
  1869. (p6) br.cond.dptk .LIs_loop
  1870. br .L999
  1871. .align 4
  1872. ;;
  /* .L100: entry of a second code path (reached from outside this section).
   * J = N >> 3 is the number of 8-column blocks and CO is reset to Y.
   * When J is zero the 8-column loop is skipped (-> .L120). */
  1873. .L100:
  1874. shr J = N, 3
  1875. mov CO = Y
  1876. ;;
  1877. cmp.eq p6, p0 = r0, J
  1878. (p6) br.cond.dpnt .L120
  1879. ;;
  /* .L111: set up one 8-column block on the .L100 path.
   * AO1..AO8 = A + {0..7} * LDA (built with add/shladd), then A is advanced
   * by 8 * LDA; BO restarts at BUFFER.  f8..f23 are cleared from f0 (reads
   * as 0.0 on IA-64).
   * I = MIN_M >> 4; zero skips the main loop (-> .L115), otherwise
   * ar.lc = 2 * I - 1 with ar.ec = 2.
   * The (p8) instructions load one leading element from every column and
   * from BO and seed the even accumulators f8..f22 with the products; p8 is
   * set outside this section (NOTE(review): presumably an alignment
   * pre-step -- confirm).
   * AO21/AO41/AO61/AO81 are secondary pointers 7 elements past AO2/AO4/AO6/
   * AO8, taken after the optional (p8) post-increments; .L112 uses them for
   * the single-element loads on the even-numbered columns. */
  1880. .align 16
  1881. .L111:
  1882. { .mfi
  1883. mov AO1 = A
  1884. mov f8 = f0
  1885. mov pr.rot= 0
  1886. }
  1887. { .mfi
  1888. add AO2 = LDA, A
  1889. mov f10 = f0
  1890. shr I = MIN_M, 4
  1891. }
  1892. ;;
  1893. { .mmf
  1894. shladd AO3 = LDA, 1, A
  1895. shladd AO4 = LDA, 1, AO2
  1896. mov f12 = f0
  1897. }
  1898. { .mmf
  1899. (p8) LDFD f32 = [AO1], SIZE
  1900. (p8) LDFD f33 = [AO2], SIZE
  1901. mov f14 = f0
  1902. }
  1903. ;;
  1904. { .mmf
  1905. shladd AO5 = LDA, 1, AO3
  1906. shladd AO6 = LDA, 1, AO4
  1907. mov f16 = f0
  1908. }
  1909. { .mmf
  1910. (p8) LDFD f34 = [AO3], SIZE
  1911. (p8) LDFD f35 = [AO4], SIZE
  1912. mov f18 = f0
  1913. }
  1914. ;;
  1915. { .mmf
  1916. shladd AO7 = LDA, 1, AO5
  1917. shladd AO8 = LDA, 1, AO6
  1918. mov f20 = f0
  1919. }
  1920. { .mmf
  1921. (p8) LDFD f36 = [AO5], SIZE
  1922. (p8) LDFD f37 = [AO6], SIZE
  1923. mov f22 = f0
  1924. }
  1925. ;;
  1926. { .mfi
  1927. (p8) LDFD f38 = [AO7], SIZE
  1928. mov f9 = f0
  1929. mov ar.ec= 2
  1930. }
  1931. { .mmf
  1932. (p8) LDFD f39 = [AO8], SIZE
  1933. mov BO = BUFFER
  1934. mov f11 = f0
  1935. }
  1936. ;;
  1937. { .mmf
  1938. (p8) LDFD f40 = [BO], 2 * SIZE
  1939. cmp.eq p6, p0 = 0, I
  1940. mov f13 = f0
  1941. }
  1942. { .mmf
  1943. shladd A = LDA, 3, A
  1944. cmp.eq p16, p0 = r0, r0
  1945. mov f15 = f0
  1946. }
  1947. ;;
  1948. { .mmf
  1949. add I = I, I
  1950. nop __LINE__
  1951. mov f17 = f0
  1952. }
  /* Prefetch pointers run RPREFETCH (odd columns) or RPREFETCH + 8 (even
   * columns) elements ahead of the column pointers. */
  1953. { .mmf
  1954. adds RPRE1 = RPREFETCH * SIZE, AO1
  1955. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  1956. mov f19 = f0
  1957. }
  1958. ;;
  1959. { .mmf
  1960. adds I = -1, I
  1961. nop __LINE__
  1962. mov f21 = f0
  1963. }
  1964. { .mmf
  1965. adds RPRE3 = RPREFETCH * SIZE, AO3
  1966. adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4
  1967. mov f23 = f0
  1968. }
  1969. ;;
  1970. { .mmf
  1971. nop __LINE__
  1972. nop __LINE__
  1973. (p8) FMPY f8 = f40, f32
  1974. }
  1975. { .mmf
  1976. adds RPRE5 = RPREFETCH * SIZE, AO5
  1977. adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6
  1978. (p8) FMPY f10 = f40, f33
  1979. }
  1980. ;;
  1981. { .mmf
  1982. adds AO21 = 7 * SIZE, AO2
  1983. adds AO41 = 7 * SIZE, AO4
  1984. (p8) FMPY f12 = f40, f34
  1985. }
  1986. { .mmf
  1987. adds RPRE7 = RPREFETCH * SIZE, AO7
  1988. adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8
  1989. (p8) FMPY f14 = f40, f35
  1990. }
  1991. ;;
  1992. { .mfi
  1993. nop __LINE__
  1994. (p8) FMPY f16 = f40, f36
  1995. mov ar.lc = I
  1996. }
  1997. { .mmf
  1998. adds WPRE = 8 * SIZE, CO
  1999. adds PREB = RPREFETCH * SIZE, BO
  2000. (p8) FMPY f18 = f40, f37
  2001. }
  2002. ;;
  2003. { .mmf
  2004. lfetch.excl.nt1 [WPRE]
  2005. adds AO61 = 7 * SIZE, AO6
  2006. (p8) FMPY f20 = f40, f38
  2007. }
  2008. { .mfb
  2009. adds AO81 = 7 * SIZE, AO8
  2010. (p8) FMPY f22 = f40, f39
  2011. (p6) br.cond.dpnt .L115
  2012. }
  2013. ;;
  2014. .align 16
  2015. .L112:
  2016. { .mmf
  2017. (p17) LDFPD f80, f95 = [AO8]
  2018. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  2019. (p17) FMA f8 = f104, f33, f8
  2020. }
  2021. { .mfi
  2022. (p17) adds AO8 = 3 * SIZE, AO8
  2023. (p17) FMA f9 = f105, f34, f9
  2024. (p16) tbit.nz.unc p14, p15 = I, 0
  2025. }
  2026. ;;
  2027. { .mmf
  2028. (p14) PREFETCH [RPRE1], 16 * SIZE
  2029. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  2030. (p17) FMA f10 = f104, f35, f10
  2031. }
  2032. { .mmf
  2033. nop __LINE__
  2034. nop __LINE__
  2035. (p17) FMA f11 = f105, f36, f11
  2036. }
  2037. ;;
  2038. { .mmf
  2039. (p15) PREFETCH [RPRE2], 16 * SIZE
  2040. (p16) LDFD f34 = [AO2], 1 * SIZE
  2041. (p17) FMA f12 = f104, f37, f12
  2042. }
  2043. { .mmf
  2044. (p17) LDFD f84 = [AO21], 8 * SIZE
  2045. nop __LINE__
  2046. (p17) FMA f13 = f105, f38, f13
  2047. }
  2048. ;;
  2049. { .mmf
  2050. (p14) PREFETCH [RPRE3], 16 * SIZE
  2051. (p16) LDFPD f36, f37 = [AO3], 2 * SIZE
  2052. (p17) FMA f14 = f104, f39, f14
  2053. }
  2054. { .mmf
  2055. nop __LINE__
  2056. nop __LINE__
  2057. (p17) FMA f15 = f105, f40, f15
  2058. }
  2059. ;;
  2060. { .mmf
  2061. (p15) PREFETCH [RPRE4], 16 * SIZE
  2062. (p16) LDFD f38 = [AO4], 1 * SIZE
  2063. (p17) FMA f16 = f104, f41, f16
  2064. }
  2065. { .mmf
  2066. (p17) LDFD f88 = [AO41], 8 * SIZE
  2067. nop __LINE__
  2068. (p17) FMA f17 = f105, f42, f17
  2069. }
  2070. ;;
  2071. { .mmf
  2072. (p14) PREFETCH [RPRE5], 16 * SIZE
  2073. (p16) LDFPD f40, f41 = [AO5], 2 * SIZE
  2074. (p17) FMA f18 = f104, f43, f18
  2075. }
  2076. { .mmf
  2077. nop __LINE__
  2078. nop __LINE__
  2079. (p17) FMA f19 = f105, f44, f19
  2080. }
  2081. ;;
  2082. { .mmf
  2083. (p15) PREFETCH [RPRE6], 16 * SIZE
  2084. (p16) LDFD f42 = [AO6], 1 * SIZE
  2085. (p17) FMA f20 = f104, f45, f20
  2086. }
  2087. { .mmf
  2088. (p17) LDFD f92 = [AO61], 8 * SIZE
  2089. nop __LINE__
  2090. (p17) FMA f21 = f105, f46, f21
  2091. }
  2092. ;;
  2093. { .mmf
  2094. (p14) PREFETCH [RPRE7], 16 * SIZE
  2095. (p16) LDFPD f44, f45 = [AO7], 2 * SIZE
  2096. (p17) FMA f22 = f104, f47, f22
  2097. }
  2098. { .mmf
  2099. nop __LINE__
  2100. nop __LINE__
  2101. (p17) FMA f23 = f105, f48, f23
  2102. }
  2103. ;;
  2104. { .mmf
  2105. (p15) PREFETCH [RPRE8], 16 * SIZE
  2106. (p16) LDFD f46 = [AO8], 1 * SIZE
  2107. (p17) FMA f8 = f106, f49, f8
  2108. }
  2109. { .mmf
  2110. (p17) LDFD f96 = [AO81], 8 * SIZE
  2111. nop __LINE__
  2112. (p17) FMA f9 = f107, f50, f9
  2113. }
  2114. ;;
  2115. { .mmf
  2116. (p14) PREFETCH [PREB], 16 * SIZE
  2117. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  2118. (p17) FMA f10 = f106, f51, f10
  2119. }
  2120. { .mmf
  2121. nop __LINE__
  2122. nop __LINE__
  2123. (p17) FMA f11 = f107, f52, f11
  2124. }
  2125. ;;
  2126. { .mmf
  2127. (p16) LDFPD f35, f50 = [AO2], 2 * SIZE
  2128. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  2129. (p17) FMA f12 = f106, f53, f12
  2130. }
  2131. { .mmf
  2132. nop __LINE__
  2133. nop __LINE__
  2134. (p17) FMA f13 = f107, f54, f13
  2135. }
  2136. ;;
  2137. { .mmf
  2138. (p16) LDFPD f52, f53 = [AO3], 2 * SIZE
  2139. nop __LINE__
  2140. (p17) FMA f14 = f106, f55, f14
  2141. }
  2142. { .mmf
  2143. nop __LINE__
  2144. nop __LINE__
  2145. (p17) FMA f15 = f107, f56, f15
  2146. }
  2147. ;;
  2148. { .mmf
  2149. (p16) LDFPD f39, f54 = [AO4], 2 * SIZE
  2150. nop __LINE__
  2151. (p17) FMA f16 = f106, f57, f16
  2152. }
  2153. { .mmf
  2154. nop __LINE__
  2155. nop __LINE__
  2156. (p17) FMA f17 = f107, f58, f17
  2157. }
  2158. ;;
  2159. { .mmf
  2160. (p16) LDFPD f56, f57 = [AO5], 2 * SIZE
  2161. nop __LINE__
  2162. (p17) FMA f18 = f106, f59, f18
  2163. }
  2164. { .mmf
  2165. nop __LINE__
  2166. nop __LINE__
  2167. (p17) FMA f19 = f107, f60, f19
  2168. }
  2169. ;;
  2170. { .mmf
  2171. (p16) LDFPD f43, f58 = [AO6], 2 * SIZE
  2172. nop __LINE__
  2173. (p17) FMA f20 = f106, f61, f20
  2174. }
  2175. { .mmf
  2176. nop __LINE__
  2177. nop __LINE__
  2178. (p17) FMA f21 = f107, f62, f21
  2179. }
  2180. ;;
  2181. { .mmf
  2182. (p16) LDFPD f60, f61 = [AO7], 2 * SIZE
  2183. nop __LINE__
  2184. (p17) FMA f22 = f106, f63, f22
  2185. }
  2186. { .mmf
  2187. nop __LINE__
  2188. nop __LINE__
  2189. (p17) FMA f23 = f107, f64, f23
  2190. }
  2191. ;;
  2192. { .mmf
  2193. (p16) LDFPD f47, f62 = [AO8], 2 * SIZE
  2194. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  2195. (p17) FMA f8 = f108, f65, f8
  2196. }
  2197. { .mmf
  2198. nop __LINE__
  2199. nop __LINE__
  2200. (p17) FMA f9 = f109, f66, f9
  2201. }
  2202. ;;
  2203. { .mmf
  2204. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  2205. nop __LINE__
  2206. (p17) FMA f10 = f108, f67, f10
  2207. }
  2208. { .mmf
  2209. nop __LINE__
  2210. nop __LINE__
  2211. (p17) FMA f11 = f109, f68, f11
  2212. }
  2213. ;;
  2214. { .mmf
  2215. (p16) LDFPD f51, f66 = [AO2], 2 * SIZE
  2216. nop __LINE__
  2217. (p17) FMA f12 = f108, f69, f12
  2218. }
  2219. { .mmf
  2220. nop __LINE__
  2221. nop __LINE__
  2222. (p17) FMA f13 = f109, f70, f13
  2223. }
  2224. ;;
  2225. { .mmf
  2226. (p16) LDFPD f68, f69 = [AO3], 2 * SIZE
  2227. nop __LINE__
  2228. (p17) FMA f14 = f108, f71, f14
  2229. }
  2230. { .mmf
  2231. nop __LINE__
  2232. nop __LINE__
  2233. (p17) FMA f15 = f109, f72, f15
  2234. }
  2235. ;;
  2236. { .mmf
  2237. (p16) LDFPD f55, f70 = [AO4], 2 * SIZE
  2238. nop __LINE__
  2239. (p17) FMA f16 = f108, f73, f16
  2240. }
  2241. { .mmf
  2242. nop __LINE__
  2243. nop __LINE__
  2244. (p17) FMA f17 = f109, f74, f17
  2245. }
  2246. ;;
  2247. { .mmf
  2248. (p16) LDFPD f72, f73 = [AO5], 2 * SIZE
  2249. nop __LINE__
  2250. (p17) FMA f18 = f108, f75, f18
  2251. }
  2252. { .mmf
  2253. nop __LINE__
  2254. nop __LINE__
  2255. (p17) FMA f19 = f109, f76, f19
  2256. }
  2257. ;;
  2258. { .mmf
  2259. (p16) LDFPD f59, f74 = [AO6], 2 * SIZE
  2260. nop __LINE__
  2261. (p17) FMA f20 = f108, f77, f20
  2262. }
  2263. { .mmf
  2264. nop __LINE__
  2265. nop __LINE__
  2266. (p17) FMA f21 = f109, f78, f21
  2267. }
  2268. ;;
  2269. { .mmf
  2270. (p16) LDFPD f76, f77 = [AO7], 2 * SIZE
  2271. nop __LINE__
  2272. (p17) FMA f22 = f108, f79, f22
  2273. }
  2274. { .mmf
  2275. nop __LINE__
  2276. nop __LINE__
  2277. (p17) FMA f23 = f109, f80, f23
  2278. }
  2279. ;;
  2280. { .mmf
  2281. (p16) LDFPD f63, f78 = [AO8], 2 * SIZE
  2282. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  2283. (p17) FMA f8 = f110, f81, f8
  2284. }
  2285. { .mmf
  2286. nop __LINE__
  2287. nop __LINE__
  2288. (p17) FMA f9 = f111, f82, f9
  2289. }
  2290. ;;
  2291. { .mmf
  2292. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  2293. nop __LINE__
  2294. (p17) FMA f10 = f110, f83, f10
  2295. }
  2296. { .mmf
  2297. nop __LINE__
  2298. nop __LINE__
  2299. (p17) FMA f11 = f111, f84, f11
  2300. }
  2301. ;;
  2302. { .mmf
  2303. (p16) LDFPD f67, f82 = [AO2]
  2304. nop __LINE__
  2305. (p17) FMA f12 = f110, f85, f12
  2306. }
  2307. { .mmf
  2308. nop __LINE__
  2309. (p16) adds AO2 = 3 * SIZE, AO2
  2310. (p17) FMA f13 = f111, f86, f13
  2311. }
  2312. ;;
  2313. { .mmf
  2314. (p16) LDFPD f84, f85 = [AO3], 2 * SIZE
  2315. nop __LINE__
  2316. (p17) FMA f14 = f110, f87, f14
  2317. }
  2318. { .mmf
  2319. nop __LINE__
  2320. nop __LINE__
  2321. (p17) FMA f15 = f111, f88, f15
  2322. }
  2323. ;;
  2324. { .mmf
  2325. (p16) LDFPD f71, f86 = [AO4]
  2326. nop __LINE__
  2327. (p17) FMA f16 = f110, f89, f16
  2328. }
  2329. { .mmf
  2330. nop __LINE__
  2331. (p16) adds AO4 = 3 * SIZE, AO4
  2332. (p17) FMA f17 = f111, f90, f17
  2333. }
  2334. ;;
  2335. { .mmf
  2336. (p16) LDFPD f88, f89 = [AO5], 2 * SIZE
  2337. nop __LINE__
  2338. (p17) FMA f18 = f110, f91, f18
  2339. }
  2340. { .mmf
  2341. nop __LINE__
  2342. nop __LINE__
  2343. (p17) FMA f19 = f111, f92, f19
  2344. }
  2345. ;;
  2346. { .mmf
  2347. (p16) LDFPD f75, f90 = [AO6]
  2348. nop __LINE__
  2349. (p17) FMA f20 = f110, f93, f20
  2350. }
  2351. { .mmf
  2352. nop __LINE__
  2353. (p16) adds AO6 = 3 * SIZE, AO6
  2354. (p17) FMA f21 = f111, f94, f21
  2355. }
  2356. ;;
  2357. { .mmf
  2358. (p16) LDFPD f92, f93 = [AO7], 2 * SIZE
  2359. nop __LINE__
  2360. (p17) FMA f22 = f110, f95, f22
  2361. }
  2362. { .mfb
  2363. adds I = -1, I
  2364. (p17) FMA f23 = f111, f96, f23
  2365. br.ctop.sptk.few .L112
  2366. }
  2367. ;;
  2368. .align 16
  /* .L115: set up the remainder loop (.L116) for the 8-column case. */
  /* I = MIN_M & 15 is the number of leftover elements; if it is zero, */
  /* p6 is set and control goes straight to .L118. */
  2369. .L115:
  2370. and I = 15, MIN_M
  2371. mov pr.rot= 0
  2372. ;;
  2373. cmp.eq p6, p0 = 0, I
  /* r0 == r0 is always true: p16 = 1 (first pipeline stage), p15 = 0. */
  2374. cmp.eq p16, p15 = r0, r0
  2375. ;;
  /* Loop count = ((I + 1) >> 1) - 1, i.e. leftovers are consumed two at a time. */
  2376. adds I = 1, I
  2377. ;;
  2378. shr I = I, 1
  2379. ;;
  2380. adds I = -1, I
  /* Secondary pointers one element past AO2/AO4/AO6/AO8. */
  2381. adds AO21 = 1 * SIZE, AO2
  2382. adds AO41 = 1 * SIZE, AO4
  2383. adds AO61 = 1 * SIZE, AO6
  2384. adds AO81 = 1 * SIZE, AO8
  2385. ;;
  2386. mov ar.lc = I
  2387. mov ar.ec= 3
  /* I is reloaded with the raw leftover count; .L116 counts it down by 2. */
  2388. and I = 15, MIN_M
  2389. (p6) br.cond.dpnt .L118
  2390. ;;
  2391. .align 16
  /* .L116: remainder loop for the 8-column case, driven by br.ctop */
  /* (ar.lc / ar.ec = 3 were set in .L115). Each pass loads a pair from BO */
  /* and from each of the eight column pointers. */
  /* Stage predicates: p16 issues the paired/first loads, p17 issues the */
  /* second-element loads via AO21/AO41/AO61/AO81, p18 accumulates the */
  /* first element into f8, f10, ..., f22. */
  /* p15 (not a rotating predicate) gates the second-element FMAs into */
  /* f9, f11, ..., f23; it is recomputed near the end of each pass. */
  /* NOTE(review): LDFPD/LDFD/FMA are macros defined outside this chunk. */
  2392. .L116:
  2393. { .mmf
  2394. (p16) LDFPD f104, f107 = [BO], 2 * SIZE
  2395. (p16) LDFPD f32, f35 = [AO1], 2 * SIZE
  2396. (p18) FMA f8 = f106, f34, f8
  2397. }
  2398. { .mmf
  2399. nop __LINE__
  2400. nop __LINE__
  2401. (p15) FMA f9 = f109, f37, f9
  2402. }
  2403. ;;
  2404. { .mmf
  2405. (p16) LDFD f38 = [AO2], 2 * SIZE
  2406. (p17) LDFD f42 = [AO21], 2 * SIZE
  2407. (p18) FMA f10 = f106, f40, f10
  2408. }
  2409. { .mmf
  2410. nop __LINE__
  2411. nop __LINE__
  2412. (p15) FMA f11 = f109, f43, f11
  2413. }
  2414. ;;
  2415. { .mmf
  2416. (p16) LDFPD f44, f47 = [AO3], 2 * SIZE
  2417. nop __LINE__
  2418. (p18) FMA f12 = f106, f46, f12
  2419. }
  2420. { .mmf
  2421. nop __LINE__
  2422. nop __LINE__
  2423. (p15) FMA f13 = f109, f49, f13
  2424. }
  2425. ;;
  2426. { .mmf
  2427. (p16) LDFD f50 = [AO4], 2 * SIZE
  2428. (p17) LDFD f54 = [AO41], 2 * SIZE
  2429. (p18) FMA f14 = f106, f52, f14
  2430. }
  2431. { .mmf
  2432. nop __LINE__
  2433. nop __LINE__
  2434. (p15) FMA f15 = f109, f55, f15
  2435. }
  2436. ;;
  2437. { .mmf
  2438. (p16) LDFPD f56, f59 = [AO5], 2 * SIZE
  2439. nop __LINE__
  2440. (p18) FMA f16 = f106, f58, f16
  2441. }
  2442. { .mmf
  2443. nop __LINE__
  2444. nop __LINE__
  2445. (p15) FMA f17 = f109, f61, f17
  2446. }
  2447. ;;
  2448. { .mmf
  2449. (p16) LDFD f62 = [AO6], 2 * SIZE
  2450. (p17) LDFD f66 = [AO61], 2 * SIZE
  2451. (p18) FMA f18 = f106, f64, f18
  2452. }
  2453. { .mmf
  2454. nop __LINE__
  /* Two leftover elements are consumed per pass. */
  2455. (p17) adds I = -2, I
  2456. (p15) FMA f19 = f109, f67, f19
  2457. }
  2458. ;;
  2459. { .mmf
  2460. (p16) LDFPD f68, f71 = [AO7], 2 * SIZE
  2461. nop __LINE__
  2462. (p18) FMA f20 = f106, f70, f20
  2463. }
  2464. { .mmf
  2465. nop __LINE__
  2466. nop __LINE__
  2467. (p15) FMA f21 = f109, f73, f21
  2468. }
  2469. ;;
  2470. { .mmf
  2471. (p16) LDFD f74 = [AO8], 2 * SIZE
  2472. (p17) LDFD f78 = [AO81], 2 * SIZE
  2473. (p15) FMA f23 = f109, f79, f23
  2474. }
  2475. { .mfb
  /* p15 = (I != -1): cleared when the leftover count was odd and the */
  /* current pair therefore has no valid second element. */
  2476. (p17) cmp.ne.unc p15, p0 = -1, I
  2477. (p18) FMA f22 = f106, f76, f22
  2478. br.ctop.sptk.few .L116
  2479. }
  2480. ;;
  2481. .L118:
  /* .L118: finish the 8-column case. */
  /* Eight values are loaded from CO (advancing CO by INCY each time) into */
  /* f32..f39, while each even/odd accumulator pair is folded (f8 += f9, ...). */
  /* AO1 keeps the original CO so the results can be stored back in place. */
  2482. { .mmf
  2483. mov AO1 = CO
  2484. LDFD f32 = [CO], INCY
  2485. FADD f8 = f8, f9
  2486. }
  2487. ;;
  2488. { .mmf
  2489. LDFD f33 = [CO], INCY
  2490. nop __LINE__
  2491. FADD f10 = f10, f11
  2492. }
  2493. ;;
  2494. { .mmf
  2495. LDFD f34 = [CO], INCY
  2496. nop __LINE__
  2497. FADD f12 = f12, f13
  2498. }
  2499. ;;
  2500. { .mmf
  2501. LDFD f35 = [CO], INCY
  2502. nop __LINE__
  2503. FADD f14 = f14, f15
  2504. }
  2505. ;;
  2506. { .mmf
  2507. LDFD f36 = [CO], INCY
  2508. nop __LINE__
  2509. FADD f16 = f16, f17
  2510. }
  2511. ;;
  2512. { .mmf
  2513. LDFD f37 = [CO], INCY
  2514. nop __LINE__
  2515. FADD f18 = f18, f19
  2516. }
  2517. ;;
  2518. { .mmf
  2519. LDFD f38 = [CO], INCY
  2520. nop __LINE__
  2521. FADD f20 = f20, f21
  2522. }
  2523. ;;
  2524. { .mmf
  2525. LDFD f39 = [CO], INCY
  2526. nop __LINE__
  2527. FADD f22 = f22, f23
  2528. }
  2529. ;;
  /* Combine each loaded value with ALPHA and its folded sum via the FMA macro */
  /* (presumably loaded + ALPHA * sum; FMA is defined outside this chunk). */
  2530. { .mmf
  2531. nop __LINE__
  2532. nop __LINE__
  2533. FMA f32 = ALPHA, f8, f32
  2534. }
  2535. { .mmf
  2536. nop __LINE__
  2537. nop __LINE__
  2538. FMA f33 = ALPHA, f10, f33
  2539. }
  2540. { .mmf
  2541. nop __LINE__
  2542. nop __LINE__
  2543. FMA f34 = ALPHA, f12, f34
  2544. }
  2545. { .mmf
  2546. nop __LINE__
  2547. nop __LINE__
  2548. FMA f35 = ALPHA, f14, f35
  2549. }
  2550. ;;
  /* Store the eight results through AO1, stepping by INCY; the remaining */
  /* four FMAs are interleaved with the first four stores. */
  2551. { .mmf
  2552. STFD [AO1] = f32
  2553. add AO1 = AO1, INCY
  2554. FMA f36 = ALPHA, f16, f36
  2555. }
  2556. ;;
  2557. { .mmf
  2558. STFD [AO1] = f33
  2559. add AO1 = AO1, INCY
  2560. FMA f37 = ALPHA, f18, f37
  2561. }
  2562. ;;
  2563. { .mmf
  2564. STFD [AO1] = f34
  2565. add AO1 = AO1, INCY
  2566. FMA f38 = ALPHA, f20, f38
  2567. }
  2568. ;;
  2569. { .mmf
  2570. STFD [AO1] = f35
  2571. add AO1 = AO1, INCY
  2572. FMA f39 = ALPHA, f22, f39
  2573. }
  2574. ;;
  2575. { .mmi
  2576. STFD [AO1] = f36
  2577. add AO1 = AO1, INCY
  /* One block of eight columns done. */
  2578. adds J = -1, J
  2579. }
  2580. ;;
  2581. { .mmi
  2582. STFD [AO1] = f37
  2583. add AO1 = AO1, INCY
  2584. nop __LINE__
  2585. }
  2586. ;;
  2587. { .mmi
  2588. STFD [AO1] = f38
  2589. add AO1 = AO1, INCY
  /* p6 = (0 < J), compared as a 4-byte value. */
  2590. cmp4.lt p6, p0 = 0, J
  2591. }
  2592. ;;
  2593. { .mib
  2594. STFD [AO1] = f39
  2595. add AO1 = AO1, INCY
  /* More 8-column blocks remain: loop back to .L111 (outside this chunk). */
  2596. (p6) br.cond.dptk .L111
  2597. }
  2598. ;;
  2599. .align 16
  /* .L120: set-up for the 4-column case. Skipped (branch to .L130) when */
  /* bit 2 of N is clear. Column pointers: AO1 = A, AO2 = A + LDA, */
  /* AO3 = A + 2*LDA, AO4 = AO2 + 2*LDA. Accumulators f8..f15 are zeroed. */
  2600. .L120:
  2601. { .mfi
  2602. mov AO1 = A
  2603. mov f8 = f0
  2604. mov pr.rot= 0
  2605. }
  2606. { .mfi
  2607. add AO2 = LDA, A
  2608. mov f10 = f0
  /* p6 = (bit 2 of N == 0). */
  2609. tbit.z p6, p0 = N, 2
  2610. }
  2611. ;;
  2612. { .mfi
  2613. shladd AO3 = LDA, 1, A
  2614. mov f12 = f0
  /* I = MIN_M / 16 (shift right by 4). */
  2615. shr I = MIN_M, 4
  2616. }
  2617. { .mfb
  2618. shladd AO4 = LDA, 1, AO2
  2619. mov f14 = f0
  2620. (p6) br.cond.dpnt .L130
  2621. }
  2622. ;;
  /* NOTE(review): p8 is set outside this chunk; when set, one leading element */
  /* per column (and one from BO) is loaded and multiplied up front. Presumably */
  /* this is an alignment peel -- confirm against the code that defines p8. */
  2623. { .mmf
  2624. (p8) LDFD f32 = [AO1], SIZE
  2625. (p8) LDFD f33 = [AO2], SIZE
  2626. mov f9 = f0
  2627. }
  2628. { .mmf
  2629. mov BO = BUFFER
  /* Advance A past the four columns handled here: A += 4 * LDA. */
  2630. shladd A = LDA, 2, A
  2631. mov f11 = f0
  2632. }
  2633. ;;
  2634. { .mmf
  2635. (p8) LDFD f40 = [BO], 2 * SIZE
  /* p6 = (MIN_M >> 4 == 0): no full 16-element block, skip the main loop. */
  2636. cmp.eq p6, p0 = 0, I
  2637. mov f13 = f0
  2638. }
  2639. { .mmf
  2640. (p8) LDFD f34 = [AO3], SIZE
  2641. (p8) LDFD f35 = [AO4], SIZE
  2642. mov f15 = f0
  2643. }
  2644. ;;
  2645. { .mmi
  2646. adds RPRE1 = RPREFETCH * SIZE, AO1
  2647. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  2648. mov ar.ec= 2
  2649. }
  2650. { .mmi
  /* p16 = 1: enable the first pipeline stage. */
  2651. cmp.eq p16, p0 = r0, r0
  /* Loop counter becomes 2 * (MIN_M >> 4) - 1 (see the adds below). */
  2652. add I = I, I
  2653. adds AO21 = 7 * SIZE, AO2
  2654. }
  2655. ;;
  2656. { .mmf
  2657. adds WPRE = 4 * SIZE, CO
  2658. adds PREB = RPREFETCH * SIZE, BO
  2659. (p8) FMPY f8 = f40, f32
  2660. }
  2661. { .mmf
  2662. adds RPRE3 = RPREFETCH * SIZE, AO3
  2663. adds I = -1, I
  2664. (p8) FMPY f10 = f40, f33
  2665. }
  2666. ;;
  2667. { .mfi
  2668. adds AO41 = 7 * SIZE, AO4
  2669. (p8) FMPY f12 = f40, f34
  2670. mov ar.lc = I
  2671. }
  2672. { .mfb
  2673. adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4
  2674. (p8) FMPY f14 = f40, f35
  2675. (p6) br.cond.dpnt .L125
  2676. }
  2677. ;;
  2678. .align 16
  /* .L122: software-pipelined main loop for the 4-column case (br.ctop, */
  /* ar.ec = 2). Stage p16 issues the loads of a pass; stage p17 consumes the */
  /* values loaded one pass earlier (register names shift by one through */
  /* rotation) and accumulates into f8..f15. */
  /* Each pass issues four LDFPD pairs from BO and from AO1/AO3; AO2 and AO4 */
  /* are read as single + pairs + single, with the last single fetched through */
  /* AO21/AO41 -- presumably because those columns are offset by one element */
  /* relative to pair alignment; TODO confirm. */
  /* I is decremented every pass; its low bit selects which prefetch streams */
  /* fire (p14: RPRE1, RPRE3, PREB; p15: RPRE2, RPRE4). */
  2679. .L122:
  2680. { .mmf
  2681. (p17) LDFPD f72, f87 = [AO4]
  2682. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  2683. (p17) FMA f8 = f104, f33, f8
  2684. }
  2685. { .mfi
  /* AO4 is advanced by 3 elements here because the paired load above has no */
  /* post-increment. */
  2686. (p17) adds AO4 = 3 * SIZE, AO4
  2687. (p17) FMA f9 = f105, f34, f9
  2688. (p16) tbit.nz.unc p14, p15 = I, 0
  2689. }
  2690. ;;
  2691. { .mmf
  2692. (p14) PREFETCH [RPRE1], 16 * SIZE
  2693. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  2694. (p17) FMA f10 = f104, f35, f10
  2695. }
  2696. { .mmf
  2697. nop __LINE__
  2698. nop __LINE__
  2699. (p17) FMA f11 = f105, f36, f11
  2700. }
  2701. ;;
  2702. { .mmf
  2703. (p15) PREFETCH [RPRE2], 16 * SIZE
  2704. (p16) LDFD f34 = [AO2], 1 * SIZE
  2705. (p17) FMA f12 = f104, f37, f12
  2706. }
  2707. { .mmf
  2708. (p17) LDFD f84 = [AO21], 8 * SIZE
  2709. nop __LINE__
  2710. (p17) FMA f13 = f105, f38, f13
  2711. }
  2712. ;;
  2713. { .mmf
  2714. (p14) PREFETCH [RPRE3], 16 * SIZE
  2715. (p16) LDFPD f36, f37 = [AO3], 2 * SIZE
  2716. (p17) FMA f14 = f104, f39, f14
  2717. }
  2718. { .mmf
  2719. nop __LINE__
  2720. nop __LINE__
  2721. (p17) FMA f15 = f105, f40, f15
  2722. }
  2723. ;;
  2724. { .mmf
  2725. (p15) PREFETCH [RPRE4], 16 * SIZE
  2726. (p16) LDFD f38 = [AO4], 1 * SIZE
  2727. (p17) FMA f8 = f106, f49, f8
  2728. }
  2729. { .mmf
  2730. (p17) LDFD f88 = [AO41], 8 * SIZE
  2731. nop __LINE__
  2732. (p17) FMA f9 = f107, f50, f9
  2733. }
  2734. ;;
  2735. { .mmf
  2736. (p14) PREFETCH [PREB], 16 * SIZE
  2737. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  2738. (p17) FMA f10 = f106, f51, f10
  2739. }
  2740. { .mmf
  2741. nop __LINE__
  2742. nop __LINE__
  2743. (p17) FMA f11 = f107, f52, f11
  2744. }
  2745. ;;
  2746. { .mmf
  2747. (p16) LDFPD f35, f50 = [AO2], 2 * SIZE
  2748. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  2749. (p17) FMA f12 = f106, f53, f12
  2750. }
  2751. { .mmf
  2752. nop __LINE__
  2753. nop __LINE__
  2754. (p17) FMA f13 = f107, f54, f13
  2755. }
  2756. ;;
  2757. { .mmf
  2758. (p16) LDFPD f52, f53 = [AO3], 2 * SIZE
  2759. nop __LINE__
  2760. (p17) FMA f14 = f106, f55, f14
  2761. }
  2762. { .mmf
  2763. nop __LINE__
  2764. nop __LINE__
  2765. (p17) FMA f15 = f107, f56, f15
  2766. }
  2767. ;;
  2768. { .mmf
  2769. (p16) LDFPD f39, f54 = [AO4], 2 * SIZE
  2770. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  2771. (p17) FMA f8 = f108, f65, f8
  2772. }
  2773. { .mmf
  2774. nop __LINE__
  2775. nop __LINE__
  2776. (p17) FMA f9 = f109, f66, f9
  2777. }
  2778. ;;
  2779. { .mmf
  2780. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  2781. nop __LINE__
  2782. (p17) FMA f10 = f108, f67, f10
  2783. }
  2784. { .mmf
  2785. nop __LINE__
  2786. nop __LINE__
  2787. (p17) FMA f11 = f109, f68, f11
  2788. }
  2789. ;;
  2790. { .mmf
  2791. (p16) LDFPD f51, f66 = [AO2], 2 * SIZE
  2792. nop __LINE__
  2793. (p17) FMA f12 = f108, f69, f12
  2794. }
  2795. { .mmf
  2796. nop __LINE__
  2797. nop __LINE__
  2798. (p17) FMA f13 = f109, f70, f13
  2799. }
  2800. ;;
  2801. { .mmf
  2802. (p16) LDFPD f68, f69 = [AO3], 2 * SIZE
  2803. nop __LINE__
  2804. (p17) FMA f14 = f108, f71, f14
  2805. }
  2806. { .mmf
  2807. nop __LINE__
  2808. nop __LINE__
  2809. (p17) FMA f15 = f109, f72, f15
  2810. }
  2811. ;;
  2812. { .mmf
  2813. (p16) LDFPD f55, f70 = [AO4], 2 * SIZE
  2814. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  2815. (p17) FMA f8 = f110, f81, f8
  2816. }
  2817. { .mmf
  2818. nop __LINE__
  2819. nop __LINE__
  2820. (p17) FMA f9 = f111, f82, f9
  2821. }
  2822. ;;
  2823. { .mmf
  2824. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  2825. nop __LINE__
  2826. (p17) FMA f10 = f110, f83, f10
  2827. }
  2828. { .mmf
  2829. nop __LINE__
  2830. nop __LINE__
  2831. (p17) FMA f11 = f111, f84, f11
  2832. }
  2833. ;;
  2834. { .mmf
  2835. (p16) LDFPD f67, f82 = [AO2]
  2836. nop __LINE__
  2837. (p17) FMA f12 = f110, f85, f12
  2838. }
  2839. { .mmf
  2840. nop __LINE__
  /* As for AO4 above: the paired load has no post-increment, so step by 3. */
  2841. (p16) adds AO2 = 3 * SIZE, AO2
  2842. (p17) FMA f13 = f111, f86, f13
  2843. }
  2844. ;;
  2845. { .mmf
  2846. (p16) LDFPD f84, f85 = [AO3], 2 * SIZE
  2847. nop __LINE__
  2848. (p17) FMA f14 = f110, f87, f14
  2849. }
  2850. { .mfb
  2851. adds I = -1, I
  2852. (p17) FMA f15 = f111, f88, f15
  2853. br.ctop.sptk.few .L122
  2854. }
  2855. ;;
  2856. .align 16
  /* .L125: set up the remainder loop (.L126) for the 4-column case. */
  /* I = MIN_M & 15 leftover elements; if zero, go straight to .L128. */
  2857. .L125:
  2858. and I = 15, MIN_M
  2859. mov pr.rot= 0
  2860. ;;
  2861. cmp.eq p6, p0 = 0, I
  /* Always-true compare: p16 = 1, p15 = 0. */
  2862. cmp.eq p16, p15 = r0, r0
  2863. ;;
  /* Loop count = ((I + 1) >> 1) - 1: leftovers are handled in pairs. */
  2864. adds I = 1, I
  2865. adds AO21 = 1 * SIZE, AO2
  2866. adds AO41 = 1 * SIZE, AO4
  2867. ;;
  2868. shr I = I, 1
  2869. ;;
  2870. adds I = -1, I
  2871. ;;
  2872. mov ar.lc = I
  2873. mov ar.ec= 3
  /* Reload the raw leftover count; .L126 counts it down by 2 per pass. */
  2874. and I = 15, MIN_M
  2875. (p6) br.cond.dpnt .L128
  2876. ;;
  2877. .align 16
  /* .L126: remainder loop for the 4-column case (br.ctop, ar.ec = 3). */
  /* p16: paired loads from BO/AO1/AO3 and first-element loads from AO2/AO4. */
  /* p17: second-element loads through AO21/AO41 and the countdown of I. */
  /* p18: accumulate the first element of each pair into f8/f10/f12/f14. */
  /* p15: accumulate the second element into f9/f11/f13/f15; it is cleared */
  /* when I reaches -1, i.e. when the leftover count was odd. */
  2878. .L126:
  2879. { .mmf
  2880. (p16) LDFPD f104, f107 = [BO], 2 * SIZE
  2881. (p16) LDFPD f32, f35 = [AO1], 2 * SIZE
  2882. (p18) FMA f8 = f106, f34, f8
  2883. }
  2884. { .mmf
  2885. nop __LINE__
  2886. nop __LINE__
  2887. (p15) FMA f9 = f109, f37, f9
  2888. }
  2889. ;;
  2890. { .mmf
  2891. (p17) LDFD f42 = [AO21], 2 * SIZE
  2892. (p16) LDFD f38 = [AO2], 2 * SIZE
  2893. (p18) FMA f10 = f106, f40, f10
  2894. }
  2895. { .mmf
  2896. nop __LINE__
  2897. nop __LINE__
  2898. (p15) FMA f11 = f109, f43, f11
  2899. }
  2900. ;;
  2901. { .mmf
  2902. (p16) LDFPD f44, f47 = [AO3], 2 * SIZE
  2903. nop __LINE__
  2904. (p18) FMA f12 = f106, f46, f12
  2905. }
  2906. { .mmf
  2907. nop __LINE__
  2908. (p17) adds I = -2, I
  2909. (p15) FMA f13 = f109, f49, f13
  2910. }
  2911. ;;
  2912. { .mmf
  2913. (p17) LDFD f54 = [AO41], 2 * SIZE
  2914. (p16) LDFD f50 = [AO4], 2 * SIZE
  2915. (p15) FMA f15 = f109, f55, f15
  2916. }
  2917. { .mfb
  /* p15 = (I != -1) for the pair entering the accumulate stage next pass. */
  2918. (p17) cmp.ne.unc p15, p0 = -1, I
  2919. (p18) FMA f14 = f106, f52, f14
  2920. br.ctop.sptk.few .L126
  2921. }
  2922. ;;
  2923. .L128:
  /* .L128: finish the 4-column case. Load four values from CO (stepping by */
  /* INCY), fold each accumulator pair, apply ALPHA via FMA, and store the */
  /* results back through AO1, which holds the original CO. */
  2924. { .mmf
  2925. mov AO1 = CO
  2926. LDFD f32 = [CO], INCY
  2927. FADD f8 = f8, f9
  2928. }
  2929. ;;
  2930. { .mmf
  2931. LDFD f33 = [CO], INCY
  2932. nop __LINE__
  2933. FADD f10 = f10, f11
  2934. }
  2935. ;;
  2936. { .mmf
  2937. LDFD f34 = [CO], INCY
  2938. nop __LINE__
  2939. FADD f12 = f12, f13
  2940. }
  2941. ;;
  2942. { .mmf
  2943. LDFD f35 = [CO], INCY
  2944. nop __LINE__
  2945. FADD f14 = f14, f15
  2946. }
  2947. ;;
  2948. { .mmf
  2949. nop __LINE__
  2950. nop __LINE__
  2951. FMA f32 = ALPHA, f8, f32
  2952. }
  2953. { .mmf
  2954. nop __LINE__
  2955. nop __LINE__
  2956. FMA f33 = ALPHA, f10, f33
  2957. }
  2958. { .mmf
  2959. nop __LINE__
  2960. nop __LINE__
  2961. FMA f34 = ALPHA, f12, f34
  2962. }
  2963. { .mmf
  2964. nop __LINE__
  2965. nop __LINE__
  2966. FMA f35 = ALPHA, f14, f35
  2967. }
  2968. ;;
  /* NOTE(review): the store bundles below list only two instructions under a */
  /* three-slot template; this relies on the assembler accepting or padding them. */
  2969. { .mmf
  2970. STFD [AO1] = f32
  2971. add AO1 = AO1, INCY
  2972. }
  2973. ;;
  2974. { .mmf
  2975. STFD [AO1] = f33
  2976. add AO1 = AO1, INCY
  2977. }
  2978. ;;
  2979. { .mmf
  2980. STFD [AO1] = f34
  2981. add AO1 = AO1, INCY
  2982. }
  2983. ;;
  2984. { .mmf
  2985. STFD [AO1] = f35
  2986. add AO1 = AO1, INCY
  2987. }
  2988. ;;
  2989. .align 16
  /* .L130: set-up for the 2-column case. Skipped (branch to .L140) when */
  /* bit 1 of N is clear. AO1 = A, AO2 = A + LDA; f8..f15 are zeroed. */
  2990. .L130:
  2991. { .mfi
  2992. mov AO1 = A
  2993. mov f8 = f0
  2994. mov pr.rot= 0
  2995. }
  2996. { .mfi
  2997. add AO2 = LDA, A
  2998. mov f10 = f0
  /* p6 = (bit 1 of N == 0). */
  2999. tbit.z p6, p0 = N, 1
  3000. }
  3001. ;;
  3002. { .mfi
  3003. mov BO = BUFFER
  3004. mov f12 = f0
  /* I = MIN_M >> 4. */
  3005. shr I = MIN_M, 4
  3006. }
  3007. { .mfb
  3008. adds WPRE = 4 * SIZE, CO
  3009. mov f14 = f0
  3010. (p6) br.cond.dpnt .L140
  3011. }
  3012. ;;
  /* NOTE(review): p8 is set outside this chunk; when set, one leading element */
  /* per column and one from BO are loaded and multiplied before the loop. */
  3013. { .mmf
  3014. (p8) LDFD f32 = [AO1], SIZE
  3015. (p8) LDFD f33 = [AO2], SIZE
  3016. mov f9 = f0
  3017. }
  3018. { .mfi
  /* Advance A past the two columns handled here: A += 2 * LDA. */
  3019. shladd A = LDA, 1, A
  3020. mov f11 = f0
  3021. mov ar.ec= 2
  3022. }
  3023. ;;
  3024. { .mmf
  3025. (p8) LDFD f40 = [BO], 2 * SIZE
  /* p6 = (MIN_M >> 4 == 0): no full block, skip the main loop. */
  3026. cmp.eq p6, p0 = 0, I
  3027. mov f13 = f0
  3028. }
  3029. { .mmf
  3030. adds RPRE1 = RPREFETCH * SIZE, AO1
  /* Loop counter = 2 * (MIN_M >> 4) - 1 after the adds below. */
  3031. add I = I, I
  3032. mov f15 = f0
  3033. }
  3034. ;;
  3035. { .mmi
  /* p16 = 1: enable the first pipeline stage. */
  3036. cmp.eq p16, p0 = r0, r0
  3037. adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2
  3038. adds I = -1, I
  3039. }
  3040. ;;
  3041. { .mfi
  3042. adds AO21 = 7 * SIZE, AO2
  3043. (p8) FMPY f8 = f40, f32
  3044. mov ar.lc = I
  3045. }
  3046. { .mfb
  3047. adds PREB = RPREFETCH * SIZE, BO
  3048. (p8) FMPY f10 = f40, f33
  3049. (p6) br.cond.dpnt .L135
  3050. }
  3051. ;;
  3052. .align 16
  /* .L132: software-pipelined main loop for the 2-column case (br.ctop, */
  /* ar.ec = 2). p16 issues loads; p17 accumulates values loaded one pass */
  /* earlier into f8/f9 (column via AO1) and f10/f11 (column via AO2). */
  /* The low bit of I alternates the prefetch streams: p14 -> RPRE1 and PREB, */
  /* p15 -> RPRE2. */
  3053. .L132:
  3054. { .mmf
  3055. (p17) LDFPD f68, f83 = [AO2]
  3056. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  3057. (p17) FMA f8 = f104, f33, f8
  3058. }
  3059. { .mfi
  /* The paired load from AO2 above has no post-increment; step by 3 here. */
  3060. (p17) adds AO2 = 3 * SIZE, AO2
  3061. (p17) FMA f9 = f105, f34, f9
  3062. (p16) tbit.nz.unc p14, p15 = I, 0
  3063. }
  3064. ;;
  3065. { .mmf
  3066. (p14) PREFETCH [RPRE1], 16 * SIZE
  3067. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  3068. (p17) FMA f10 = f104, f35, f10
  3069. }
  3070. { .mmf
  3071. nop __LINE__
  3072. nop __LINE__
  3073. (p17) FMA f11 = f105, f36, f11
  3074. }
  3075. ;;
  3076. { .mmf
  3077. (p15) PREFETCH [RPRE2], 16 * SIZE
  3078. (p16) LDFD f34 = [AO2], 1 * SIZE
  3079. (p17) FMA f8 = f106, f49, f8
  3080. }
  3081. { .mmf
  3082. (p17) LDFD f84 = [AO21], 8 * SIZE
  3083. nop __LINE__
  3084. (p17) FMA f9 = f107, f50, f9
  3085. }
  3086. ;;
  3087. { .mmf
  3088. (p14) PREFETCH [PREB], 16 * SIZE
  3089. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  3090. (p17) FMA f10 = f106, f51, f10
  3091. }
  3092. { .mmf
  3093. nop __LINE__
  3094. nop __LINE__
  3095. (p17) FMA f11 = f107, f52, f11
  3096. }
  3097. ;;
  3098. { .mmf
  3099. (p16) LDFPD f35, f50 = [AO2], 2 * SIZE
  3100. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  3101. (p17) FMA f8 = f108, f65, f8
  3102. }
  3103. { .mmf
  3104. nop __LINE__
  3105. nop __LINE__
  3106. (p17) FMA f9 = f109, f66, f9
  3107. }
  3108. ;;
  3109. { .mmf
  3110. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  3111. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  3112. (p17) FMA f10 = f108, f67, f10
  3113. }
  3114. { .mmf
  3115. nop __LINE__
  3116. nop __LINE__
  3117. (p17) FMA f11 = f109, f68, f11
  3118. }
  3119. ;;
  3120. { .mmf
  3121. (p16) LDFPD f51, f66 = [AO2], 2 * SIZE
  3122. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  3123. (p17) FMA f8 = f110, f81, f8
  3124. }
  3125. { .mmf
  3126. nop __LINE__
  3127. nop __LINE__
  3128. (p17) FMA f9 = f111, f82, f9
  3129. }
  3130. ;;
  3131. { .mmf
  3132. (p16) LDFPD f80, f81 = [AO1], 2 * SIZE
  3133. nop __LINE__
  3134. (p17) FMA f10 = f110, f83, f10
  3135. }
  3136. { .mfb
  3137. adds I = -1, I
  3138. (p17) FMA f11 = f111, f84, f11
  3139. br.ctop.sptk.few .L132
  3140. }
  3141. ;;
  3142. .align 16
  /* .L135: straight-line remainder for the 2-column case. */
  /* If MIN_M & 15 is zero, skip to .L138. Otherwise bits 3..0 of MIN_M select */
  /* predicated chunks of 8 (p12), 4 (p13), 2 (p14) and 1 (p15) elements. */
  3143. .L135:
  3144. and I = 15, MIN_M
  3145. ;;
  3146. cmp.eq p6, p0 = 0, I
  3147. (p6) br.cond.dpnt .L138
  3148. ;;
  3149. tbit.nz p12, p0 = MIN_M, 3
  3150. tbit.nz p13, p0 = MIN_M, 2
  3151. tbit.nz p14, p0 = MIN_M, 1
  3152. tbit.nz p15, p0 = MIN_M, 0
  3153. ;;
  /* Loads. AO1 and BO are read in pairs; AO2 is read as single, pairs, single */
  /* for each chunk (presumably because it is offset by one element relative */
  /* to pair alignment -- TODO confirm). */
  3154. (p12) LDFPD f100, f101 = [BO], 2 * SIZE
  3155. (p12) LDFPD f32, f33 = [AO1], 2 * SIZE
  3156. (p12) LDFD f34 = [AO2], 1 * SIZE
  3157. ;;
  3158. (p12) LDFPD f36, f37 = [AO1], 2 * SIZE
  3159. (p12) LDFPD f35, f38 = [AO2], 2 * SIZE
  3160. ;;
  3161. (p12) LDFPD f102, f103 = [BO], 2 * SIZE
  3162. (p12) LDFPD f39, f42 = [AO2], 2 * SIZE
  3163. ;;
  3164. (p12) LDFPD f40, f41 = [AO1], 2 * SIZE
  3165. (p12) LDFPD f43, f46 = [AO2], 2 * SIZE
  3166. ;;
  3167. (p12) LDFPD f104, f105 = [BO], 2 * SIZE
  3168. (p12) LDFPD f44, f45 = [AO1], 2 * SIZE
  3169. (p12) LDFD f47 = [AO2], 1 * SIZE
  3170. ;;
  3171. (p12) LDFPD f106, f107 = [BO], 2 * SIZE
  3172. (p13) LDFD f50 = [AO2], 1 * SIZE
  3173. (p13) LDFPD f48, f49 = [AO1], 2 * SIZE
  3174. ;;
  3175. (p13) LDFPD f108, f109 = [BO], 2 * SIZE
  3176. (p13) LDFPD f51, f54 = [AO2], 2 * SIZE
  3177. ;;
  3178. (p13) LDFPD f110, f111 = [BO], 2 * SIZE
  3179. (p13) LDFPD f52, f53 = [AO1], 2 * SIZE
  3180. (p13) LDFD f55 = [AO2], 1 * SIZE
  3181. ;;
  3182. (p14) LDFPD f56, f57 = [AO1], 2 * SIZE
  3183. (p14) LDFD f58 = [AO2], 1 * SIZE
  3184. ;;
  3185. (p14) LDFPD f112, f113 = [BO], 2 * SIZE
  3186. (p15) LDFD f60 = [AO1]
  3187. (p14) LDFD f59 = [AO2], 1 * SIZE
  3188. ;;
  3189. (p15) LDFD f61 = [AO2]
  3190. (p15) LDFD f114 = [BO]
  3191. ;;
  /* Accumulate. The AO1 column uses f8, f9, f12, f13; the AO2 column uses */
  /* f10, f11, f14, f15. .L138 folds each group into f8 and f10. */
  3192. (p12) FMA f8 = f100, f32, f8
  3193. (p12) FMA f9 = f101, f33, f9
  3194. (p12) FMA f10 = f100, f34, f10
  3195. (p12) FMA f11 = f101, f35, f11
  3196. ;;
  3197. (p12) FMA f12 = f102, f36, f12
  3198. (p12) FMA f13 = f103, f37, f13
  3199. (p12) FMA f14 = f102, f38, f14
  3200. (p12) FMA f15 = f103, f39, f15
  3201. ;;
  3202. (p12) FMA f8 = f104, f40, f8
  3203. (p12) FMA f9 = f105, f41, f9
  3204. (p12) FMA f10 = f104, f42, f10
  3205. (p12) FMA f11 = f105, f43, f11
  3206. ;;
  3207. (p12) FMA f12 = f106, f44, f12
  3208. (p12) FMA f13 = f107, f45, f13
  3209. (p12) FMA f14 = f106, f46, f14
  3210. (p12) FMA f15 = f107, f47, f15
  3211. ;;
  3212. (p13) FMA f8 = f108, f48, f8
  3213. (p13) FMA f9 = f109, f49, f9
  3214. (p13) FMA f10 = f108, f50, f10
  3215. (p13) FMA f11 = f109, f51, f11
  3216. ;;
  3217. (p13) FMA f12 = f110, f52, f12
  3218. (p13) FMA f13 = f111, f53, f13
  3219. (p13) FMA f14 = f110, f54, f14
  3220. (p13) FMA f15 = f111, f55, f15
  3221. ;;
  3222. (p14) FMA f8 = f112, f56, f8
  3223. (p14) FMA f9 = f113, f57, f9
  3224. (p14) FMA f10 = f112, f58, f10
  3225. (p14) FMA f11 = f113, f59, f11
  3226. ;;
  3227. (p15) FMA f12 = f114, f60, f12
  3228. (p15) FMA f14 = f114, f61, f14
  3229. ;;
  3230. .L138:
  /* .L138: finish the 2-column case. Fold f8,f9,f12,f13 into f8 and */
  /* f10,f11,f14,f15 into f10, then update two values at CO with ALPHA. */
  3231. FADD f8 = f8, f9
  3232. FADD f10 = f10, f11
  3233. FADD f12 = f12, f13
  3234. FADD f14 = f14, f15
  3235. ;;
  3236. FADD f8 = f8, f12
  3237. FADD f10 = f10, f14
  3238. ;;
  /* AO1 keeps the original CO for the stores; CO itself advances by INCY */
  /* with each load. */
  3239. { .mmf
  3240. mov AO1 = CO
  3241. LDFD f32 = [CO], INCY
  3242. }
  3243. ;;
  3244. { .mmf
  3245. LDFD f33 = [CO], INCY
  3246. nop __LINE__
  3247. }
  3248. ;;
  3249. { .mmf
  3250. nop __LINE__
  3251. nop __LINE__
  3252. FMA f32 = ALPHA, f8, f32
  3253. }
  3254. { .mmf
  3255. nop __LINE__
  3256. nop __LINE__
  3257. FMA f33 = ALPHA, f10, f33
  3258. }
  3259. ;;
  3260. { .mmf
  3261. STFD [AO1] = f32
  3262. add AO1 = AO1, INCY
  3263. }
  3264. ;;
  3265. { .mmf
  3266. STFD [AO1] = f33
  3267. }
  3268. ;;
  3269. .align 16
  /* .L140: set-up for the final single-column case. Skipped (branch to .L199) */
  /* when bit 0 of N is clear. AO1 = A, BO = BUFFER; f8..f15 are zeroed. */
  3270. .L140:
  3271. { .mfi
  3272. mov AO1 = A
  3273. mov f8 = f0
  /* I = MIN_M >> 4. */
  3274. shr I = MIN_M, 4
  3275. }
  3276. { .mfi
  3277. mov BO = BUFFER
  3278. mov f10 = f0
  /* p7 = (bit 0 of N == 0). */
  3279. tbit.z p7, p0 = N, 0
  3280. }
  3281. ;;
  3282. { .mfi
  /* p6 = (MIN_M >> 4 == 0): used below to skip the main loop. */
  3283. cmp.eq p6, p0 = 0, I
  3284. mov f12 = f0
  3285. mov pr.rot= 0
  3286. }
  3287. { .mfb
  /* Loop counter = 2 * (MIN_M >> 4) - 1 after the later adds. */
  3288. add I = I, I
  3289. mov f14 = f0
  3290. (p7) br.cond.dpnt .L199
  3291. }
  3292. ;;
  /* NOTE(review): p8 is set outside this chunk; when set, one leading element */
  /* from AO1 and BO is loaded and multiplied before the loop. */
  3293. { .mfi
  3294. (p8) LDFD f32 = [AO1], SIZE
  3295. mov f9 = f0
  3296. mov ar.ec= 2
  3297. }
  3298. { .mmf
  3299. (p8) LDFD f40 = [BO], 2 * SIZE
  /* Advance A past the single column handled here. */
  3300. add A = A, LDA
  3301. mov f11 = f0
  3302. }
  3303. ;;
  3304. { .mmf
  3305. adds WPRE = 1 * SIZE, CO
  3306. adds PREB = RPREFETCH * SIZE, BO
  3307. mov f13 = f0
  3308. }
  3309. { .mmf
  /* p16 = 1: enable the first pipeline stage. */
  3310. cmp.eq p16, p0 = r0, r0
  3311. adds I = -1, I
  3312. mov f15 = f0
  3313. }
  3314. ;;
  3315. { .mfi
  /* Prefetch the output location with exclusive (write) intent. */
  3316. lfetch.excl.nt1 [WPRE]
  3317. (p8) FMPY f8 = f40, f32
  3318. mov ar.lc = I
  3319. }
  3320. { .mmb
  3321. nop __LINE__
  3322. nop __LINE__
  3323. (p6) br.cond.dpnt .L145
  3324. }
  3325. ;;
  3326. .align 16
  /* .L142: software-pipelined main loop for the single-column case */
  /* (br.ctop, ar.ec = 2). p16 issues paired loads from AO1 and BO; p17 */
  /* accumulates the values loaded one pass earlier into f8 and f9. */
  /* NOTE(review): p14/p15 are computed from the low bit of I but no */
  /* instruction in this loop is predicated on them. */
  3327. .L142:
  3328. { .mmf
  3329. (p17) LDFPD f81, f82 = [AO1], 2 * SIZE
  3330. (p17) LDFPD f110, f111 = [BO], 2 * SIZE
  3331. (p17) FMA f8 = f104, f33, f8
  3332. }
  3333. { .mfi
  3334. nop __LINE__
  3335. (p17) FMA f9 = f105, f34, f9
  3336. (p16) tbit.nz.unc p14, p15 = I, 0
  3337. }
  3338. ;;
  3339. { .mmf
  3340. (p16) LDFPD f32, f33 = [AO1], 2 * SIZE
  3341. (p16) LDFPD f103, f104 = [BO], 2 * SIZE
  3342. (p17) FMA f8 = f106, f49, f8
  3343. }
  3344. { .mmf
  3345. nop __LINE__
  3346. nop __LINE__
  3347. (p17) FMA f9 = f107, f50, f9
  3348. }
  3349. ;;
  3350. { .mmf
  3351. (p16) LDFPD f105, f106 = [BO], 2 * SIZE
  3352. (p16) LDFPD f48, f49 = [AO1], 2 * SIZE
  3353. (p17) FMA f8 = f108, f65, f8
  3354. }
  3355. { .mmf
  3356. nop __LINE__
  3357. nop __LINE__
  3358. (p17) FMA f9 = f109, f66, f9
  3359. }
  3360. ;;
  3361. { .mmf
  3362. (p16) LDFPD f64, f65 = [AO1], 2 * SIZE
  3363. (p16) LDFPD f107, f108 = [BO], 2 * SIZE
  3364. (p17) FMA f8 = f110, f81, f8
  3365. }
  3366. { .mfb
  3367. adds I = -1, I
  3368. (p17) FMA f9 = f111, f82, f9
  3369. br.ctop.sptk.few .L142
  3370. }
  3371. ;;
  3372. .align 16
  /* .L145: straight-line remainder for the single-column case. */
  /* If MIN_M & 15 is zero, skip to .L148. Otherwise bits 3..0 of MIN_M select */
  /* predicated chunks of 8 (p12), 4 (p13), 2 (p14) and 1 (p15) elements. */
  3373. .L145:
  3374. and I = 15, MIN_M
  3375. ;;
  3376. cmp.eq p6, p0 = 0, I
  3377. (p6) br.cond.dpnt .L148
  3378. ;;
  3379. tbit.nz p12, p0 = MIN_M, 3
  3380. tbit.nz p13, p0 = MIN_M, 2
  3381. tbit.nz p14, p0 = MIN_M, 1
  3382. tbit.nz p15, p0 = MIN_M, 0
  3383. ;;
  /* Paired loads from AO1 and BO, each post-incremented by two elements. */
  3384. (p12) LDFPD f32, f33 = [AO1], 2 * SIZE
  3385. (p12) LDFPD f100, f101 = [BO], 2 * SIZE
  3386. ;;
  3387. (p12) LDFPD f36, f37 = [AO1], 2 * SIZE
  3388. (p12) LDFPD f102, f103 = [BO], 2 * SIZE
  3389. ;;
  3390. (p12) LDFPD f40, f41 = [AO1], 2 * SIZE
  3391. (p12) LDFPD f104, f105 = [BO], 2 * SIZE
  3392. ;;
  3393. (p12) LDFPD f44, f45 = [AO1], 2 * SIZE
  3394. (p12) LDFPD f106, f107 = [BO], 2 * SIZE
  3395. ;;
  3396. (p13) LDFPD f48, f49 = [AO1], 2 * SIZE
  3397. (p13) LDFPD f108, f109 = [BO], 2 * SIZE
  3398. ;;
  3399. (p13) LDFPD f52, f53 = [AO1], 2 * SIZE
  3400. (p13) LDFPD f110, f111 = [BO], 2 * SIZE
  3401. ;;
  3402. (p14) LDFPD f56, f57 = [AO1], 2 * SIZE
  3403. (p14) LDFPD f112, f113 = [BO], 2 * SIZE
  3404. ;;
  3405. (p15) LDFD f60 = [AO1]
  3406. (p15) LDFD f114 = [BO]
  3407. ;;
  /* Accumulate into the eight partial sums f8..f15; every instruction within */
  /* each group below writes a distinct accumulator. .L148 sums them all. */
  3408. (p12) FMA f8 = f100, f32, f8
  3409. (p12) FMA f9 = f101, f33, f9
  3410. (p12) FMA f10 = f102, f36, f10
  3411. (p12) FMA f11 = f103, f37, f11
  3412. (p12) FMA f12 = f104, f40, f12
  3413. (p12) FMA f13 = f105, f41, f13
  3414. (p12) FMA f14 = f106, f44, f14
  3415. (p12) FMA f15 = f107, f45, f15
  3416. ;;
  3417. (p13) FMA f8 = f108, f48, f8
  3418. (p13) FMA f9 = f109, f49, f9
  3419. (p13) FMA f10 = f110, f52, f10
  3420. (p13) FMA f11 = f111, f53, f11
  3421. (p14) FMA f12 = f112, f56, f12
  3422. (p14) FMA f13 = f113, f57, f13
  3423. (p15) FMA f14 = f114, f60, f14
  3424. ;;
  3425. .L148:
  /* .L148: finish the single-column case. Load one value from CO, reduce */
  /* f8..f15 pairwise down to f8, apply ALPHA via FMA, and store the result */
  /* back to the same CO location (CO is not advanced here). */
  3426. { .mmf
  3427. LDFD f32 = [CO]
  3428. nop __LINE__
  3429. FADD f8 = f8, f9
  3430. }
  3431. { .mmf
  3432. nop __LINE__
  3433. nop __LINE__
  3434. FADD f10 = f10, f11
  3435. }
  3436. ;;
  3437. { .mmf
  3438. nop __LINE__
  3439. nop __LINE__
  3440. FADD f12 = f12, f13
  3441. }
  3442. { .mmf
  3443. nop __LINE__
  3444. nop __LINE__
  3445. FADD f14 = f14, f15
  3446. }
  3447. ;;
  3448. { .mmf
  3449. nop __LINE__
  3450. nop __LINE__
  3451. FADD f8 = f8, f12
  3452. }
  3453. { .mmf
  3454. nop __LINE__
  3455. nop __LINE__
  3456. FADD f10 = f10, f14
  3457. }
  3458. ;;
  3459. { .mmf
  3460. nop __LINE__
  3461. nop __LINE__
  3462. FADD f8 = f8, f10
  3463. }
  3464. ;;
  3465. { .mmf
  3466. nop __LINE__
  3467. nop __LINE__
  3468. FMA f32 = ALPHA, f8, f32
  3469. }
  3470. ;;
  3471. { .mmf
  3472. STFD [CO] = f32
  3473. nop __LINE__
  3474. nop __LINE__
  3475. }
  3476. ;;
  3477. .align 16
  /* .L199: bottom of the outer loop. IS += P and A += LDAP << BASE_SHIFT; */
  /* while M > IS, branch back to .LIs_loop (defined outside this chunk). */
  3478. .L199:
  3479. adds IS = P, IS
  3480. shladd A = LDAP, BASE_SHIFT, A
  3481. ;;
  3482. cmp.gt p6, p0 = M, IS
  3483. (p6) br.cond.dptk .LIs_loop
  3484. .align 4
  3485. ;;
  3486. .L999:
  /* .L999: common exit. Return value r8 = 0. f16..f23 are reloaded with */
  /* ldf.fill from two interleaved stack streams (SP and r9 = SP + 16, each */
  /* stepping by 32 bytes); SP therefore ends 128 bytes higher, while the last */
  /* load through r9 has no post-increment. */
  /* ar.lc, the predicate registers (mask -1 = all) and ar.pfs are restored */
  /* from ARLC, PR and ARPFS before returning through b0. */
  3487. mov r8 = r0
  3488. adds r9 = 1 * 16, SP
  3489. ;;
  3490. ldf.fill f16 = [SP], 32
  3491. ldf.fill f17 = [r9], 32
  3492. mov ar.lc = ARLC
  3493. ;;
  3494. ldf.fill f18 = [SP], 32
  3495. ldf.fill f19 = [r9], 32
  3496. mov pr = PR, -1
  3497. ;;
  3498. ldf.fill f20 = [SP], 32
  3499. ldf.fill f21 = [r9], 32
  3500. mov ar.pfs = ARPFS
  3501. ;;
  3502. ldf.fill f22 = [SP], 32
  3503. ldf.fill f23 = [r9]
  3504. br.ret.sptk.many b0
  3505. ;;
  3506. EPILOGUE