You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qgemv_t.S 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER  /* defined before common.h is included; presumably selects its assembler-side definitions - TODO confirm */
  39. #include "common.h"  /* expected to supply SIZE, BASE_SHIFT, LDFD, STFD, FMA, PROLOGUE and PROFCODE used below - verify */
  40. #define SP r12  /* base from which the prologue addresses the stack-passed arguments (SP + 16, SP + 24) */
  41. #define M r32  /* inner-loop extent: I = M >> 3 full passes, bits 2..0 tested for the tail; routine exits early when M <= 0 */
  42. #define N r33  /* outer-loop extent: J = N >> 3 passes of .L11, then bits 2 and 1 tested; routine exits early when N <= 0 */
  43. #ifndef XDOUBLE  /* the two branches assign A/LDA/X1/INCX/Y1/INCY to different registers */
  44. #define A r36  /* matrix cursor; AO1..AO8 are derived from it and it is advanced by a multiple of LDA per section */
  45. #define LDA r37  /* scaled by BASE_SHIFT at entry, then used as the post-increment of the matrix loads */
  46. #define X1 r38  /* x pointer on entry; later reused as a read cursor over BUFFER */
  47. #define INCX r39  /* scaled by BASE_SHIFT at entry; compared against SIZE to detect the no-copy case */
  48. #define Y1 r34  /* y read cursor; loaded from the stack in the prologue */
  49. #define INCY r35  /* loaded from the stack in the prologue, then scaled by BASE_SHIFT */
  50. #else  /* XDOUBLE: X1 and INCX are additionally loaded from the stack in the prologue */
  51. #define A r38  /* matrix cursor (same role as in the non-XDOUBLE branch) */
  52. #define LDA r39  /* matrix load post-increment after scaling by BASE_SHIFT */
  53. #define X1 r34  /* x pointer; loaded from the stack in the prologue */
  54. #define INCX r35  /* x stride; loaded from the stack in the prologue */
  55. #define Y1 r36  /* y read cursor; loaded from the stack in the prologue */
  56. #define INCY r37  /* y stride; loaded from the stack in the prologue */
  57. #endif
  58. #define BUFFER r11  /* loaded from the stack; receives a contiguous copy of x, or is set to X1 when INCX == SIZE */
  59. #define I r15  /* inner trip count, written to ar.lc; r15 is also used raw as a stack-argument cursor in the prologue */
  60. #define J r16  /* outer trip count for the .L11 loop */
  61. #define AO1 r17  /* matrix load cursor: A + 0 * SIZE */
  62. #define AO2 r18  /* matrix load cursor: A + 1 * SIZE */
  63. #define AO3 r19  /* matrix load cursor: A + 2 * SIZE */
  64. #define AO4 r20  /* matrix load cursor: A + 3 * SIZE */
  65. #define AO5 r21  /* matrix load cursor: A + 4 * SIZE (pulled back by 4 * SIZE in the tails when bit 2 of M is clear) */
  66. #define AO6 r22  /* matrix load cursor: A + 5 * SIZE (same tail adjustment as AO5) */
  67. #define AO7 r23  /* matrix load cursor: A + 6 * SIZE (tail adjustment depends on bits 2 and 1 of M) */
  68. #define AO8 r24  /* matrix load cursor: A + 7 * SIZE */
  69. #define X2 r25  /* second x cursor: X1 + 4 * INCX while copying, BUFFER + 2 * SIZE while computing */
  70. #define Y2 r26  /* second y read cursor: Y1 + 4 * INCY */
  71. #define LDA7M8 r27  /* post-increment of the last load in each group: LDA - k * LDA + 8 * SIZE, k = 8, 4 or 2 per section */
  72. #define INCX5 r28  /* 5 * INCX */
  73. #define INCY5 r29  /* 5 * INCY */
  74. #define YY1 r8  /* store cursor: into BUFFER during the x copy, then over y for the results */
  75. #define YY2 r9  /* second store cursor: BUFFER + 4 * SIZE during the copy, then Y1 + 4 * INCY */
  76. #define ARLC r30  /* copy of ar.lc taken at entry */
  77. #define PR r31  /* copy of the predicate registers taken at entry */
  78. #ifdef DOUBLE
  79. #define RPREFETCH (16 * 3 + 8)  /* presumably a prefetch distance - TODO confirm; no use visible in the reviewed part of the file */
  80. #else
  81. #define RPREFETCH (16 * 3 + 16)  /* larger value when DOUBLE is not defined */
  82. #endif
  83. #define PREFETCH lfetch.nt1  /* alias for lfetch with the .nt1 hint */
  84. #define ALPHA f6  /* copy of f8 taken at entry; multiplies the accumulators before they are added to y (f8 is reused as an accumulator) */
  85. PROLOGUE
  86. .prologue
  87. PROFCODE
  88. { .mmi
  89. mov ARLC = ar.lc
  90. }
  91. { .mmi
  92. adds r15 = 24, SP
  93. adds r14 = 16, SP
  94. }
  95. ;;
  96. #ifdef XDOUBLE
  97. ld8 X1 = [r14], 16
  98. ld8 INCX = [r15], 16
  99. ;;
  100. #endif
  101. ld8 Y1 = [r14], 16
  102. ld8 INCY = [r15], 16
  103. ;;
  104. ld8 BUFFER = [r14]
  105. ;;
  106. mov PR = pr
  107. ;;
  108. mov ALPHA = f8
  109. .body
  110. ;;
  111. { .mmi
  112. cmp.ge p7, p0 = r0, M
  113. cmp.ge p6, p0 = r0, N
  114. }
  115. ;;
  116. { .mmi
  117. shladd INCX = INCX, BASE_SHIFT, r0
  118. shladd INCY = INCY, BASE_SHIFT, r0
  119. shladd LDA = LDA, BASE_SHIFT, r0
  120. }
  121. ;;
  122. { .mbb
  123. (p7) br.cond.dpnt .L999
  124. (p6) br.cond.dpnt .L999
  125. }
  126. .align 16
  127. ;;
  128. shladd INCY5 = INCY, 2, INCY
  129. shladd INCX5 = INCX, 2, INCX
  130. cmp.eq p10, p0 = SIZE, INCX
  131. ;;
  132. (p10) mov BUFFER = X1
  133. (p10) br.cond.dptk .L10
  134. ;;
  135. mov pr.rot= 0
  136. shladd X2 = INCX, 2, X1
  137. mov YY1 = BUFFER
  138. adds YY2 = 4 * SIZE, BUFFER
  139. ;;
  140. shr I = M, 3
  141. ;;
  142. { .mmi
  143. adds I = -1, I
  144. cmp.eq p16, p0 = r0, r0
  145. mov ar.ec= 5
  146. }
  147. ;;
  148. { .mmi
  149. mov ar.lc = I
  150. }
  151. { .mib
  152. cmp.gt p6, p0 = 0, I
  153. tbit.nz p13, p0 = M, 2
  154. (p6) br.cond.dpnt .L05
  155. }
  156. ;;
  157. .align 16
  158. .L01:
  159. (p20) STFD [YY1] = f36, SIZE
  160. (p20) STFD [YY2] = f56, SIZE
  161. (p16) LDFD f32 = [X1], INCX
  162. (p16) LDFD f52 = [X2], INCX
  163. ;;
  164. (p20) STFD [YY1] = f41, SIZE
  165. (p20) STFD [YY2] = f61, SIZE
  166. (p16) LDFD f37 = [X1], INCX
  167. (p16) LDFD f57 = [X2], INCX
  168. ;;
  169. (p20) STFD [YY1] = f46, SIZE
  170. (p20) STFD [YY2] = f66, SIZE
  171. (p16) LDFD f42 = [X1], INCX
  172. (p16) LDFD f62 = [X2], INCX
  173. ;;
  174. (p20) STFD [YY1] = f51, 5 * SIZE
  175. (p20) STFD [YY2] = f71, 5 * SIZE
  176. (p16) LDFD f47 = [X1], INCX5
  177. (p16) LDFD f67 = [X2], INCX5
  178. br.ctop.sptk.few .L01
  179. ;;
  180. .align 16
  181. .L05:
  182. (p13) LDFD f32 = [X1], INCX
  183. tbit.nz p14, p0 = M, 1
  184. ;;
  185. (p13) LDFD f33 = [X1], INCX
  186. tbit.nz p15, p0 = M, 0
  187. ;;
  188. (p13) LDFD f34 = [X1], INCX
  189. ;;
  190. (p13) LDFD f35 = [X1], INCX
  191. ;;
  192. (p14) LDFD f36 = [X1], INCX
  193. ;;
  194. (p13) STFD [YY1] = f32, SIZE
  195. (p14) LDFD f37 = [X1], INCX
  196. ;;
  197. (p13) STFD [YY1] = f33, SIZE
  198. (p15) LDFD f38 = [X1], INCX
  199. ;;
  200. (p13) STFD [YY1] = f34, SIZE
  201. ;;
  202. (p13) STFD [YY1] = f35, SIZE
  203. ;;
  204. (p14) STFD [YY1] = f36, SIZE
  205. ;;
  206. (p14) STFD [YY1] = f37, SIZE
  207. ;;
  208. (p15) STFD [YY1] = f38, SIZE
  209. ;;
  210. .align 16
  211. .L10:
  212. mov YY1 = Y1
  213. shladd Y2 = INCY, 2, Y1
  214. shladd YY2 = INCY, 2, Y1
  215. ;;
  216. { .mmi
  217. nop __LINE__
  218. shr J = N, 3
  219. }
  220. ;;
  221. { .mib
  222. nop __LINE__
  223. cmp.eq p6, p0 = r0, J
  224. (p6) br.cond.dpnt .L20
  225. }
  226. ;;
  227. .align 16
  228. .L11:
  229. mov AO1 = A
  230. adds AO2 = 1 * SIZE, A
  231. adds AO3 = 2 * SIZE, A
  232. adds AO4 = 3 * SIZE, A
  233. adds AO5 = 4 * SIZE, A
  234. adds AO6 = 5 * SIZE, A
  235. adds AO7 = 6 * SIZE, A
  236. adds AO8 = 7 * SIZE, A
  237. shladd A = LDA, 3, A
  238. ;;
  239. shladd LDA7M8 = LDA, 3, r0
  240. ;;
  241. sub LDA7M8 = LDA, LDA7M8
  242. ;;
  243. adds LDA7M8 = 8 * SIZE, LDA7M8
  244. ;;
  245. mov f8 = f0
  246. mov f9 = f0
  247. mov f10 = f0
  248. mov f11 = f0
  249. mov f12 = f0
  250. mov f13 = f0
  251. mov f14 = f0
  252. mov f15 = f0
  253. mov pr.rot= 0
  254. shr I = M, 3
  255. mov ar.ec = 2
  256. ;;
  257. mov X1 = BUFFER
  258. adds X2 = 2 * SIZE, BUFFER
  259. ;;
  260. cmp.eq p16, p0 = r0, r0
  261. ;;
  262. adds I = -1, I
  263. ;;
  264. mov ar.lc = I
  265. cmp.eq p6, p0 = -1, I
  266. (p6) br.cond.dpnt .L15
  267. ;;
  268. .align 16
  269. .L12:
  270. (p16) LDFD f32 = [AO1], LDA
  271. (p16) LDFD f34 = [AO3], LDA
  272. (p16) LDFD f36 = [AO5], LDA
  273. (p16) LDFD f38 = [AO7], LDA
  274. ;;
  275. (p16) LDFD f33 = [AO2], LDA
  276. (p16) LDFD f35 = [AO4], LDA
  277. (p16) LDFD f37 = [AO6], LDA
  278. (p16) LDFD f39 = [AO8], LDA
  279. ;;
  280. (p16) LDFD f40 = [AO1], LDA
  281. (p16) LDFD f42 = [AO3], LDA
  282. (p16) LDFD f44 = [AO5], LDA
  283. (p16) LDFD f46 = [AO7], LDA
  284. ;;
  285. (p16) LDFD f41 = [AO2], LDA
  286. (p16) LDFD f43 = [AO4], LDA
  287. (p16) LDFD f45 = [AO6], LDA
  288. (p16) LDFD f47 = [AO8], LDA
  289. ;;
  290. (p16) LDFD f48 = [AO1], LDA
  291. (p16) LDFD f50 = [AO3], LDA
  292. (p16) LDFD f52 = [AO5], LDA
  293. (p16) LDFD f54 = [AO7], LDA
  294. ;;
  295. (p16) LDFD f49 = [AO2], LDA
  296. (p16) LDFD f51 = [AO4], LDA
  297. (p16) LDFD f53 = [AO6], LDA
  298. (p16) LDFD f55 = [AO8], LDA
  299. ;;
  300. (p16) LDFD f56 = [AO1], LDA
  301. (p16) LDFD f58 = [AO3], LDA
  302. (p16) LDFD f60 = [AO5], LDA
  303. (p16) LDFD f62 = [AO7], LDA
  304. ;;
  305. (p16) LDFD f57 = [AO2], LDA
  306. (p16) LDFD f59 = [AO4], LDA
  307. (p16) LDFD f61 = [AO6], LDA
  308. (p16) LDFD f63 = [AO8], LDA
  309. ;;
  310. (p16) LDFD f64 = [AO1], LDA
  311. (p16) LDFD f66 = [AO3], LDA
  312. (p16) LDFD f68 = [AO5], LDA
  313. (p16) LDFD f70 = [AO7], LDA
  314. ;;
  315. (p16) LDFD f65 = [AO2], LDA
  316. (p16) LDFD f67 = [AO4], LDA
  317. (p16) LDFD f69 = [AO6], LDA
  318. (p16) LDFD f71 = [AO8], LDA
  319. ;;
  320. (p16) LDFD f72 = [AO1], LDA
  321. (p16) LDFD f74 = [AO3], LDA
  322. (p16) LDFD f76 = [AO5], LDA
  323. (p16) LDFD f78 = [AO7], LDA
  324. ;;
  325. (p16) LDFD f73 = [AO2], LDA
  326. (p16) LDFD f75 = [AO4], LDA
  327. (p16) LDFD f77 = [AO6], LDA
  328. (p16) LDFD f79 = [AO8], LDA
  329. ;;
  330. (p16) LDFD f80 = [AO1], LDA
  331. (p16) LDFD f82 = [AO3], LDA
  332. (p16) LDFD f84 = [AO5], LDA
  333. (p16) LDFD f86 = [AO7], LDA
  334. ;;
  335. (p16) LDFD f81 = [AO2], LDA
  336. (p16) LDFD f83 = [AO4], LDA
  337. (p16) LDFD f85 = [AO6], LDA
  338. (p16) LDFD f87 = [AO8], LDA
  339. ;;
  340. (p16) LDFD f88 = [AO1], LDA7M8
  341. (p16) LDFD f90 = [AO3], LDA7M8
  342. (p16) LDFD f92 = [AO5], LDA7M8
  343. (p16) LDFD f94 = [AO7], LDA7M8
  344. ;;
  345. (p16) LDFD f89 = [AO2], LDA7M8
  346. (p16) LDFD f91 = [AO4], LDA7M8
  347. (p16) LDFD f93 = [AO6], LDA7M8
  348. (p16) LDFD f95 = [AO8], LDA7M8
  349. ;;
  350. (p16) LDFD f96 = [X1], 1 * SIZE
  351. (p16) LDFD f98 = [X2], 1 * SIZE
  352. ;;
  353. (p16) LDFD f97 = [X1], 3 * SIZE
  354. (p16) LDFD f99 = [X2], 3 * SIZE
  355. ;;
  356. (p16) LDFD f100 = [X1], 1 * SIZE
  357. (p16) LDFD f102 = [X2], 1 * SIZE
  358. ;;
  359. (p16) LDFD f101 = [X1], 3 * SIZE
  360. (p16) LDFD f103 = [X2], 3 * SIZE
  361. ;;
  362. (p16) FMA f8 = f96, f32, f8
  363. (p16) FMA f9 = f96, f40, f9
  364. (p16) FMA f10 = f96, f48, f10
  365. (p16) FMA f11 = f96, f56, f11
  366. (p16) FMA f12 = f96, f64, f12
  367. (p16) FMA f13 = f96, f72, f13
  368. (p16) FMA f14 = f96, f80, f14
  369. (p16) FMA f15 = f96, f88, f15
  370. ;;
  371. (p16) FMA f8 = f97, f33, f8
  372. (p16) FMA f9 = f97, f41, f9
  373. (p16) FMA f10 = f97, f49, f10
  374. (p16) FMA f11 = f97, f57, f11
  375. (p16) FMA f12 = f97, f65, f12
  376. (p16) FMA f13 = f97, f73, f13
  377. (p16) FMA f14 = f97, f81, f14
  378. (p16) FMA f15 = f97, f89, f15
  379. ;;
  380. (p16) FMA f8 = f98, f34, f8
  381. (p16) FMA f9 = f98, f42, f9
  382. (p16) FMA f10 = f98, f50, f10
  383. (p16) FMA f11 = f98, f58, f11
  384. (p16) FMA f12 = f98, f66, f12
  385. (p16) FMA f13 = f98, f74, f13
  386. (p16) FMA f14 = f98, f82, f14
  387. (p16) FMA f15 = f98, f90, f15
  388. ;;
  389. (p16) FMA f8 = f99, f35, f8
  390. (p16) FMA f9 = f99, f43, f9
  391. (p16) FMA f10 = f99, f51, f10
  392. (p16) FMA f11 = f99, f59, f11
  393. (p16) FMA f12 = f99, f67, f12
  394. (p16) FMA f13 = f99, f75, f13
  395. (p16) FMA f14 = f99, f83, f14
  396. (p16) FMA f15 = f99, f91, f15
  397. ;;
  398. (p16) FMA f8 = f100, f36, f8
  399. (p16) FMA f9 = f100, f44, f9
  400. (p16) FMA f10 = f100, f52, f10
  401. (p16) FMA f11 = f100, f60, f11
  402. (p16) FMA f12 = f100, f68, f12
  403. (p16) FMA f13 = f100, f76, f13
  404. (p16) FMA f14 = f100, f84, f14
  405. (p16) FMA f15 = f100, f92, f15
  406. ;;
  407. (p16) FMA f8 = f101, f37, f8
  408. (p16) FMA f9 = f101, f45, f9
  409. (p16) FMA f10 = f101, f53, f10
  410. (p16) FMA f11 = f101, f61, f11
  411. (p16) FMA f12 = f101, f69, f12
  412. (p16) FMA f13 = f101, f77, f13
  413. (p16) FMA f14 = f101, f85, f14
  414. (p16) FMA f15 = f101, f93, f15
  415. ;;
  416. (p16) FMA f8 = f102, f38, f8
  417. (p16) FMA f9 = f102, f46, f9
  418. (p16) FMA f10 = f102, f54, f10
  419. (p16) FMA f11 = f102, f62, f11
  420. (p16) FMA f12 = f102, f70, f12
  421. (p16) FMA f13 = f102, f78, f13
  422. (p16) FMA f14 = f102, f86, f14
  423. (p16) FMA f15 = f102, f94, f15
  424. ;;
  425. (p16) FMA f8 = f103, f39, f8
  426. (p16) FMA f9 = f103, f47, f9
  427. (p16) FMA f10 = f103, f55, f10
  428. (p16) FMA f11 = f103, f63, f11
  429. (p16) FMA f12 = f103, f71, f12
  430. (p16) FMA f13 = f103, f79, f13
  431. (p16) FMA f14 = f103, f87, f14
  432. (p16) FMA f15 = f103, f95, f15
  433. br.ctop.sptk.few .L12
  434. ;;
  435. .align 16
  436. .L15:
  437. tbit.nz p13, p11 = M, 2
  438. tbit.nz p14, p12 = M, 1
  439. ;;
  440. { .mmi
  441. (p11) adds AO5 = - 4 * SIZE, AO5
  442. }
  443. { .mbb
  444. (p11) adds AO7 = - 4 * SIZE, AO7
  445. }
  446. ;;
  447. { .mmi
  448. (p13) LDFD f32 = [AO1], LDA
  449. (p13) LDFD f34 = [AO3], LDA
  450. tbit.nz p15, p0 = M, 0
  451. }
  452. { .mmi
  453. (p14) LDFD f36 = [AO5], LDA
  454. (p11) adds AO6 = - 4 * SIZE, AO6
  455. (p12) adds AO7 = - 2 * SIZE, AO7
  456. }
  457. ;;
  458. (p13) LDFD f33 = [AO2], LDA
  459. (p13) LDFD f35 = [AO4], LDA
  460. (p14) LDFD f37 = [AO6], LDA
  461. (p15) LDFD f38 = [AO7], LDA
  462. ;;
  463. (p13) LDFD f40 = [AO1], LDA
  464. (p13) LDFD f42 = [AO3], LDA
  465. (p14) LDFD f44 = [AO5], LDA
  466. (p15) LDFD f46 = [AO7], LDA
  467. ;;
  468. (p13) LDFD f41 = [AO2], LDA
  469. (p13) LDFD f43 = [AO4], LDA
  470. (p14) LDFD f45 = [AO6], LDA
  471. ;;
  472. (p13) LDFD f48 = [AO1], LDA
  473. (p13) LDFD f50 = [AO3], LDA
  474. (p14) LDFD f52 = [AO5], LDA
  475. (p15) LDFD f54 = [AO7], LDA
  476. ;;
  477. (p13) LDFD f49 = [AO2], LDA
  478. (p13) LDFD f51 = [AO4], LDA
  479. (p14) LDFD f53 = [AO6], LDA
  480. ;;
  481. (p13) LDFD f56 = [AO1], LDA
  482. (p13) LDFD f58 = [AO3], LDA
  483. (p14) LDFD f60 = [AO5], LDA
  484. (p15) LDFD f62 = [AO7], LDA
  485. ;;
  486. (p13) LDFD f57 = [AO2], LDA
  487. (p13) LDFD f59 = [AO4], LDA
  488. (p14) LDFD f61 = [AO6], LDA
  489. ;;
  490. (p13) LDFD f64 = [AO1], LDA
  491. (p13) LDFD f66 = [AO3], LDA
  492. (p14) LDFD f68 = [AO5], LDA
  493. (p15) LDFD f70 = [AO7], LDA
  494. ;;
  495. (p13) LDFD f65 = [AO2], LDA
  496. (p13) LDFD f67 = [AO4], LDA
  497. (p14) LDFD f69 = [AO6], LDA
  498. ;;
  499. (p13) LDFD f72 = [AO1], LDA
  500. (p13) LDFD f74 = [AO3], LDA
  501. (p14) LDFD f76 = [AO5], LDA
  502. (p15) LDFD f78 = [AO7], LDA
  503. ;;
  504. (p13) LDFD f73 = [AO2], LDA
  505. (p13) LDFD f75 = [AO4], LDA
  506. (p14) LDFD f77 = [AO6], LDA
  507. ;;
  508. (p13) LDFD f80 = [AO1], LDA
  509. (p13) LDFD f82 = [AO3], LDA
  510. (p14) LDFD f84 = [AO5], LDA
  511. (p15) LDFD f86 = [AO7], LDA
  512. ;;
  513. (p13) LDFD f81 = [AO2], LDA
  514. (p13) LDFD f83 = [AO4], LDA
  515. (p14) LDFD f85 = [AO6], LDA
  516. ;;
  517. (p13) LDFD f88 = [AO1]
  518. (p13) LDFD f90 = [AO3]
  519. (p14) LDFD f92 = [AO5]
  520. (p15) LDFD f94 = [AO7]
  521. ;;
  522. (p13) LDFD f89 = [AO2]
  523. (p13) LDFD f91 = [AO4]
  524. (p14) LDFD f93 = [AO6]
  525. ;;
  526. (p13) LDFD f96 = [X1], 1 * SIZE
  527. (p13) LDFD f98 = [X2], 1 * SIZE
  528. ;;
  529. (p13) LDFD f97 = [X1], 3 * SIZE
  530. (p13) LDFD f99 = [X2], 3 * SIZE
  531. ;;
  532. (p14) LDFD f100 = [X1], 1 * SIZE
  533. ;;
  534. (p14) LDFD f101 = [X1], 1 * SIZE
  535. ;;
  536. (p15) LDFD f102 = [X1], 1 * SIZE
  537. ;;
  538. (p13) FMA f8 = f96, f32, f8
  539. (p13) FMA f9 = f96, f40, f9
  540. (p13) FMA f10 = f96, f48, f10
  541. (p13) FMA f11 = f96, f56, f11
  542. (p13) FMA f12 = f96, f64, f12
  543. (p13) FMA f13 = f96, f72, f13
  544. (p13) FMA f14 = f96, f80, f14
  545. (p13) FMA f15 = f96, f88, f15
  546. ;;
  547. (p13) FMA f8 = f97, f33, f8
  548. (p13) FMA f9 = f97, f41, f9
  549. (p13) FMA f10 = f97, f49, f10
  550. (p13) FMA f11 = f97, f57, f11
  551. (p13) FMA f12 = f97, f65, f12
  552. (p13) FMA f13 = f97, f73, f13
  553. (p13) FMA f14 = f97, f81, f14
  554. (p13) FMA f15 = f97, f89, f15
  555. ;;
  556. (p13) FMA f8 = f98, f34, f8
  557. (p13) FMA f9 = f98, f42, f9
  558. (p13) FMA f10 = f98, f50, f10
  559. (p13) FMA f11 = f98, f58, f11
  560. (p13) FMA f12 = f98, f66, f12
  561. (p13) FMA f13 = f98, f74, f13
  562. (p13) FMA f14 = f98, f82, f14
  563. (p13) FMA f15 = f98, f90, f15
  564. ;;
  565. (p13) FMA f8 = f99, f35, f8
  566. (p13) FMA f9 = f99, f43, f9
  567. (p13) FMA f10 = f99, f51, f10
  568. (p13) FMA f11 = f99, f59, f11
  569. (p13) FMA f12 = f99, f67, f12
  570. (p13) FMA f13 = f99, f75, f13
  571. (p13) FMA f14 = f99, f83, f14
  572. (p13) FMA f15 = f99, f91, f15
  573. ;;
  574. (p14) FMA f8 = f100, f36, f8
  575. (p14) FMA f9 = f100, f44, f9
  576. (p14) FMA f10 = f100, f52, f10
  577. (p14) FMA f11 = f100, f60, f11
  578. (p14) FMA f12 = f100, f68, f12
  579. (p14) FMA f13 = f100, f76, f13
  580. (p14) FMA f14 = f100, f84, f14
  581. (p14) FMA f15 = f100, f92, f15
  582. ;;
  583. (p14) FMA f8 = f101, f37, f8
  584. (p14) FMA f9 = f101, f45, f9
  585. (p14) FMA f10 = f101, f53, f10
  586. (p14) FMA f11 = f101, f61, f11
  587. (p14) FMA f12 = f101, f69, f12
  588. (p14) FMA f13 = f101, f77, f13
  589. (p14) FMA f14 = f101, f85, f14
  590. (p14) FMA f15 = f101, f93, f15
  591. ;;
  592. (p15) FMA f8 = f102, f38, f8
  593. (p15) FMA f9 = f102, f46, f9
  594. (p15) FMA f10 = f102, f54, f10
  595. (p15) FMA f11 = f102, f62, f11
  596. (p15) FMA f12 = f102, f70, f12
  597. (p15) FMA f13 = f102, f78, f13
  598. (p15) FMA f14 = f102, f86, f14
  599. (p15) FMA f15 = f102, f94, f15
  600. ;;
  601. LDFD f32 = [Y1], INCY
  602. ;;
  603. LDFD f33 = [Y1], INCY
  604. ;;
  605. LDFD f34 = [Y1], INCY
  606. ;;
  607. LDFD f35 = [Y1], INCY5
  608. ;;
  609. LDFD f36 = [Y2], INCY
  610. ;;
  611. LDFD f37 = [Y2], INCY
  612. ;;
  613. LDFD f38 = [Y2], INCY
  614. ;;
  615. LDFD f39 = [Y2], INCY5
  616. ;;
  617. FMA f32 = ALPHA, f8, f32
  618. FMA f33 = ALPHA, f9, f33
  619. FMA f34 = ALPHA, f10, f34
  620. FMA f35 = ALPHA, f11, f35
  621. FMA f36 = ALPHA, f12, f36
  622. FMA f37 = ALPHA, f13, f37
  623. FMA f38 = ALPHA, f14, f38
  624. FMA f39 = ALPHA, f15, f39
  625. ;;
  626. STFD [YY1] = f32
  627. add YY1 = YY1, INCY
  628. ;;
  629. STFD [YY1] = f33
  630. add YY1 = YY1, INCY
  631. ;;
  632. STFD [YY1] = f34
  633. add YY1 = YY1, INCY
  634. ;;
  635. STFD [YY1] = f35
  636. add YY1 = YY1, INCY5
  637. ;;
  638. STFD [YY2] = f36
  639. add YY2 = YY2, INCY
  640. ;;
  641. STFD [YY2] = f37
  642. add YY2 = YY2, INCY
  643. ;;
  644. STFD [YY2] = f38
  645. add YY2 = YY2, INCY
  646. ;;
  647. STFD [YY2] = f39
  648. add YY2 = YY2, INCY5
  649. ;;
  650. adds J = -1, J
  651. ;;
  652. cmp.lt p6, p0 = 0, J
  653. (p6) br.cond.dptk .L11
  654. ;;
  655. .align 16
  656. .L20:
  657. tbit.z p6, p0 = N, 2
  658. ;;
  659. (p6) br.cond.dpnt .L30
  660. ;;
  661. mov AO1 = A
  662. adds AO2 = 1 * SIZE, A
  663. adds AO3 = 2 * SIZE, A
  664. adds AO4 = 3 * SIZE, A
  665. adds AO5 = 4 * SIZE, A
  666. adds AO6 = 5 * SIZE, A
  667. adds AO7 = 6 * SIZE, A
  668. adds AO8 = 7 * SIZE, A
  669. shladd A = LDA, 2, A
  670. ;;
  671. shladd LDA7M8 = LDA, 2, r0
  672. ;;
  673. sub LDA7M8 = LDA, LDA7M8
  674. ;;
  675. adds LDA7M8 = 8 * SIZE, LDA7M8
  676. ;;
  677. mov f8 = f0
  678. mov f9 = f0
  679. mov f10 = f0
  680. mov f11 = f0
  681. mov f12 = f0
  682. mov f13 = f0
  683. mov f14 = f0
  684. mov f15 = f0
  685. mov pr.rot= 0
  686. shr I = M, 3
  687. mov ar.ec = 2
  688. ;;
  689. mov X1 = BUFFER
  690. adds X2 = 2 * SIZE, BUFFER
  691. ;;
  692. cmp.eq p16, p0 = r0, r0
  693. ;;
  694. adds I = -1, I
  695. ;;
  696. mov ar.lc = I
  697. cmp.eq p6, p0 = -1, I
  698. (p6) br.cond.dpnt .L25
  699. ;;
  700. .align 16
  701. .L22:
  702. (p16) LDFD f32 = [AO1], LDA
  703. (p16) LDFD f34 = [AO3], LDA
  704. (p16) LDFD f36 = [AO5], LDA
  705. (p16) LDFD f38 = [AO7], LDA
  706. ;;
  707. (p16) LDFD f33 = [AO2], LDA
  708. (p16) LDFD f35 = [AO4], LDA
  709. (p16) LDFD f37 = [AO6], LDA
  710. (p16) LDFD f39 = [AO8], LDA
  711. ;;
  712. (p16) LDFD f40 = [AO1], LDA
  713. (p16) LDFD f42 = [AO3], LDA
  714. (p16) LDFD f44 = [AO5], LDA
  715. (p16) LDFD f46 = [AO7], LDA
  716. ;;
  717. (p16) LDFD f41 = [AO2], LDA
  718. (p16) LDFD f43 = [AO4], LDA
  719. (p16) LDFD f45 = [AO6], LDA
  720. (p16) LDFD f47 = [AO8], LDA
  721. ;;
  722. (p16) LDFD f48 = [AO1], LDA
  723. (p16) LDFD f50 = [AO3], LDA
  724. (p16) LDFD f52 = [AO5], LDA
  725. (p16) LDFD f54 = [AO7], LDA
  726. ;;
  727. (p16) LDFD f49 = [AO2], LDA
  728. (p16) LDFD f51 = [AO4], LDA
  729. (p16) LDFD f53 = [AO6], LDA
  730. (p16) LDFD f55 = [AO8], LDA
  731. ;;
  732. (p16) LDFD f56 = [AO1], LDA7M8
  733. (p16) LDFD f58 = [AO3], LDA7M8
  734. (p16) LDFD f60 = [AO5], LDA7M8
  735. (p16) LDFD f62 = [AO7], LDA7M8
  736. ;;
  737. (p16) LDFD f57 = [AO2], LDA7M8
  738. (p16) LDFD f59 = [AO4], LDA7M8
  739. (p16) LDFD f61 = [AO6], LDA7M8
  740. (p16) LDFD f63 = [AO8], LDA7M8
  741. ;;
  742. (p16) LDFD f96 = [X1], 1 * SIZE
  743. (p16) LDFD f98 = [X2], 1 * SIZE
  744. ;;
  745. (p16) LDFD f97 = [X1], 3 * SIZE
  746. (p16) LDFD f99 = [X2], 3 * SIZE
  747. ;;
  748. (p16) LDFD f100 = [X1], 1 * SIZE
  749. (p16) LDFD f102 = [X2], 1 * SIZE
  750. ;;
  751. (p16) LDFD f101 = [X1], 3 * SIZE
  752. (p16) LDFD f103 = [X2], 3 * SIZE
  753. ;;
  754. (p16) FMA f8 = f96, f32, f8
  755. (p16) FMA f9 = f96, f40, f9
  756. (p16) FMA f10 = f96, f48, f10
  757. (p16) FMA f11 = f96, f56, f11
  758. ;;
  759. (p16) FMA f8 = f97, f33, f8
  760. (p16) FMA f9 = f97, f41, f9
  761. (p16) FMA f10 = f97, f49, f10
  762. (p16) FMA f11 = f97, f57, f11
  763. ;;
  764. (p16) FMA f8 = f98, f34, f8
  765. (p16) FMA f9 = f98, f42, f9
  766. (p16) FMA f10 = f98, f50, f10
  767. (p16) FMA f11 = f98, f58, f11
  768. ;;
  769. (p16) FMA f8 = f99, f35, f8
  770. (p16) FMA f9 = f99, f43, f9
  771. (p16) FMA f10 = f99, f51, f10
  772. (p16) FMA f11 = f99, f59, f11
  773. ;;
  774. (p16) FMA f8 = f100, f36, f8
  775. (p16) FMA f9 = f100, f44, f9
  776. (p16) FMA f10 = f100, f52, f10
  777. (p16) FMA f11 = f100, f60, f11
  778. ;;
  779. (p16) FMA f8 = f101, f37, f8
  780. (p16) FMA f9 = f101, f45, f9
  781. (p16) FMA f10 = f101, f53, f10
  782. (p16) FMA f11 = f101, f61, f11
  783. ;;
  784. (p16) FMA f8 = f102, f38, f8
  785. (p16) FMA f9 = f102, f46, f9
  786. (p16) FMA f10 = f102, f54, f10
  787. (p16) FMA f11 = f102, f62, f11
  788. ;;
  789. (p16) FMA f8 = f103, f39, f8
  790. (p16) FMA f9 = f103, f47, f9
  791. (p16) FMA f10 = f103, f55, f10
  792. (p16) FMA f11 = f103, f63, f11
  793. br.ctop.sptk.few .L22
  794. ;;
  795. .align 16
  796. .L25:
  797. tbit.nz p13, p11 = M, 2
  798. tbit.nz p14, p12 = M, 1
  799. ;;
  800. { .mmi
  801. (p11) adds AO5 = - 4 * SIZE, AO5
  802. }
  803. { .mbb
  804. (p11) adds AO7 = - 4 * SIZE, AO7
  805. }
  806. ;;
  807. { .mmi
  808. (p13) LDFD f32 = [AO1], LDA
  809. (p13) LDFD f34 = [AO3], LDA
  810. tbit.nz p15, p0 = M, 0
  811. }
  812. { .mmi
  813. (p14) LDFD f36 = [AO5], LDA
  814. (p11) adds AO6 = - 4 * SIZE, AO6
  815. (p12) adds AO7 = - 2 * SIZE, AO7
  816. }
  817. ;;
  818. (p13) LDFD f33 = [AO2], LDA
  819. (p13) LDFD f35 = [AO4], LDA
  820. (p14) LDFD f37 = [AO6], LDA
  821. (p15) LDFD f38 = [AO7], LDA
  822. ;;
  823. (p13) LDFD f40 = [AO1], LDA
  824. (p13) LDFD f42 = [AO3], LDA
  825. (p14) LDFD f44 = [AO5], LDA
  826. (p15) LDFD f46 = [AO7], LDA
  827. ;;
  828. (p13) LDFD f41 = [AO2], LDA
  829. (p13) LDFD f43 = [AO4], LDA
  830. (p14) LDFD f45 = [AO6], LDA
  831. ;;
  832. (p13) LDFD f48 = [AO1], LDA
  833. (p13) LDFD f50 = [AO3], LDA
  834. (p14) LDFD f52 = [AO5], LDA
  835. (p15) LDFD f54 = [AO7], LDA
  836. ;;
  837. (p13) LDFD f49 = [AO2], LDA
  838. (p13) LDFD f51 = [AO4], LDA
  839. (p14) LDFD f53 = [AO6], LDA
  840. ;;
  841. (p13) LDFD f56 = [AO1]
  842. (p13) LDFD f58 = [AO3]
  843. (p14) LDFD f60 = [AO5]
  844. (p15) LDFD f62 = [AO7]
  845. ;;
  846. (p13) LDFD f57 = [AO2]
  847. (p13) LDFD f59 = [AO4]
  848. (p14) LDFD f61 = [AO6]
  849. ;;
  850. (p13) LDFD f96 = [X1], 1 * SIZE
  851. (p13) LDFD f98 = [X2], 1 * SIZE
  852. ;;
  853. (p13) LDFD f97 = [X1], 3 * SIZE
  854. (p13) LDFD f99 = [X2], 3 * SIZE
  855. ;;
  856. (p14) LDFD f100 = [X1], 1 * SIZE
  857. ;;
  858. (p14) LDFD f101 = [X1], 1 * SIZE
  859. ;;
  860. (p15) LDFD f102 = [X1], 1 * SIZE
  861. ;;
  862. (p13) FMA f8 = f96, f32, f8
  863. (p13) FMA f9 = f96, f40, f9
  864. (p13) FMA f10 = f96, f48, f10
  865. (p13) FMA f11 = f96, f56, f11
  866. ;;
  867. (p13) FMA f8 = f97, f33, f8
  868. (p13) FMA f9 = f97, f41, f9
  869. (p13) FMA f10 = f97, f49, f10
  870. (p13) FMA f11 = f97, f57, f11
  871. ;;
  872. (p13) FMA f8 = f98, f34, f8
  873. (p13) FMA f9 = f98, f42, f9
  874. (p13) FMA f10 = f98, f50, f10
  875. (p13) FMA f11 = f98, f58, f11
  876. ;;
  877. (p13) FMA f8 = f99, f35, f8
  878. (p13) FMA f9 = f99, f43, f9
  879. (p13) FMA f10 = f99, f51, f10
  880. (p13) FMA f11 = f99, f59, f11
  881. ;;
  882. (p14) FMA f8 = f100, f36, f8
  883. (p14) FMA f9 = f100, f44, f9
  884. (p14) FMA f10 = f100, f52, f10
  885. (p14) FMA f11 = f100, f60, f11
  886. ;;
  887. (p14) FMA f8 = f101, f37, f8
  888. (p14) FMA f9 = f101, f45, f9
  889. (p14) FMA f10 = f101, f53, f10
  890. (p14) FMA f11 = f101, f61, f11
  891. ;;
  892. (p15) FMA f8 = f102, f38, f8
  893. (p15) FMA f9 = f102, f46, f9
  894. (p15) FMA f10 = f102, f54, f10
  895. (p15) FMA f11 = f102, f62, f11
  896. ;;
  897. LDFD f32 = [Y1], INCY
  898. ;;
  899. LDFD f33 = [Y1], INCY
  900. ;;
  901. LDFD f34 = [Y1], INCY
  902. ;;
  903. LDFD f35 = [Y1], INCY
  904. ;;
  905. FMA f32 = ALPHA, f8, f32
  906. FMA f33 = ALPHA, f9, f33
  907. FMA f34 = ALPHA, f10, f34
  908. FMA f35 = ALPHA, f11, f35
  909. ;;
  910. STFD [YY1] = f32
  911. add YY1 = YY1, INCY
  912. ;;
  913. STFD [YY1] = f33
  914. add YY1 = YY1, INCY
  915. ;;
  916. STFD [YY1] = f34
  917. add YY1 = YY1, INCY
  918. ;;
  919. STFD [YY1] = f35
  920. add YY1 = YY1, INCY
  921. ;;
  922. .align 16
  923. .L30:
  924. tbit.z p6, p0 = N, 1
  925. ;;
  926. (p6) br.cond.dpnt .L40
  927. ;;
  928. mov AO1 = A
  929. adds AO2 = 1 * SIZE, A
  930. adds AO3 = 2 * SIZE, A
  931. adds AO4 = 3 * SIZE, A
  932. adds AO5 = 4 * SIZE, A
  933. adds AO6 = 5 * SIZE, A
  934. adds AO7 = 6 * SIZE, A
  935. adds AO8 = 7 * SIZE, A
  936. shladd A = LDA, 1, A
  937. ;;
  938. shladd LDA7M8 = LDA, 1, r0
  939. ;;
  940. sub LDA7M8 = LDA, LDA7M8
  941. ;;
  942. adds LDA7M8 = 8 * SIZE, LDA7M8
  943. ;;
  944. mov f8 = f0
  945. mov f9 = f0
  946. mov f10 = f0
  947. mov f11 = f0
  948. mov f12 = f0
  949. mov f13 = f0
  950. mov f14 = f0
  951. mov f15 = f0
  952. mov pr.rot= 0
  953. shr I = M, 3
  954. mov ar.ec = 2
  955. ;;
  956. mov X1 = BUFFER
  957. adds X2 = 2 * SIZE, BUFFER
  958. ;;
  959. cmp.eq p16, p0 = r0, r0
  960. ;;
  961. adds I = -1, I
  962. ;;
  963. mov ar.lc = I
  964. cmp.eq p6, p0 = -1, I
  965. (p6) br.cond.dpnt .L35
  966. ;;
  /*
   * .L32 - main loop for the two-vector case: eight elements per pass.
   * Every instruction is predicated on p16.  ar.lc was loaded with
   * (M >> 3) - 1 and ar.ec with 2 before entry.
   * NOTE(review): ar.ec = 2 appears to add one trailing pass with p16 clear;
   * confirm against the br.ctop definition.
   * FMA and LDFD are macros defined outside this chunk; the comments below
   * assume FMA d = a, b, c computes d = a * b + c - TODO confirm.
   */
  967. .align 16
  968. .L32:
  /* First vector: f32..f39 <- elements 0..7; each AOn then steps by LDA. */
  969. (p16) LDFD f32 = [AO1], LDA
  970. (p16) LDFD f34 = [AO3], LDA
  971. (p16) LDFD f36 = [AO5], LDA
  972. (p16) LDFD f38 = [AO7], LDA
  973. ;;
  974. (p16) LDFD f33 = [AO2], LDA
  975. (p16) LDFD f35 = [AO4], LDA
  976. (p16) LDFD f37 = [AO6], LDA
  977. (p16) LDFD f39 = [AO8], LDA
  978. ;;
  /*
   * Second vector: f40..f47 <- elements 0..7.  The LDA7M8 post-increment
   * (8 * SIZE - LDA) returns each AOn to the first vector, eight elements on.
   */
  979. (p16) LDFD f40 = [AO1], LDA7M8
  980. (p16) LDFD f42 = [AO3], LDA7M8
  981. (p16) LDFD f44 = [AO5], LDA7M8
  982. (p16) LDFD f46 = [AO7], LDA7M8
  983. ;;
  984. (p16) LDFD f41 = [AO2], LDA7M8
  985. (p16) LDFD f43 = [AO4], LDA7M8
  986. (p16) LDFD f45 = [AO6], LDA7M8
  987. (p16) LDFD f47 = [AO8], LDA7M8
  988. ;;
  /*
   * f96..f103 <- eight consecutive BUFFER elements.  X1 reads elements
   * 0, 1, 4, 5 and X2 reads 2, 3, 6, 7; the +1/+3 increments advance each
   * pointer by 8 * SIZE per pass.
   */
  989. (p16) LDFD f96 = [X1], 1 * SIZE
  990. (p16) LDFD f98 = [X2], 1 * SIZE
  991. ;;
  992. (p16) LDFD f97 = [X1], 3 * SIZE
  993. (p16) LDFD f99 = [X2], 3 * SIZE
  994. ;;
  995. (p16) LDFD f100 = [X1], 1 * SIZE
  996. (p16) LDFD f102 = [X2], 1 * SIZE
  997. ;;
  998. (p16) LDFD f101 = [X1], 3 * SIZE
  999. (p16) LDFD f103 = [X2], 3 * SIZE
  1000. ;;
  /*
   * Accumulate element by element: f8 takes the first vector (f32..f39),
   * f9 the second (f40..f47).  Each pair is in its own instruction group
   * because it depends on the f8/f9 written by the previous pair.
   */
  1001. (p16) FMA f8 = f96, f32, f8
  1002. (p16) FMA f9 = f96, f40, f9
  1003. ;;
  1004. (p16) FMA f8 = f97, f33, f8
  1005. (p16) FMA f9 = f97, f41, f9
  1006. ;;
  1007. (p16) FMA f8 = f98, f34, f8
  1008. (p16) FMA f9 = f98, f42, f9
  1009. ;;
  1010. (p16) FMA f8 = f99, f35, f8
  1011. (p16) FMA f9 = f99, f43, f9
  1012. ;;
  1013. (p16) FMA f8 = f100, f36, f8
  1014. (p16) FMA f9 = f100, f44, f9
  1015. ;;
  1016. (p16) FMA f8 = f101, f37, f8
  1017. (p16) FMA f9 = f101, f45, f9
  1018. ;;
  1019. (p16) FMA f8 = f102, f38, f8
  1020. (p16) FMA f9 = f102, f46, f9
  1021. ;;
  1022. (p16) FMA f8 = f103, f39, f8
  1023. (p16) FMA f9 = f103, f47, f9
  /* Counted loop branch driven by ar.lc / ar.ec. */
  1024. br.ctop.sptk.few .L32
  1025. ;;
  /*
   * .L35 - remainder of the two-vector case (the low three bits of M),
   * followed by the write-back of both results.
   *   p13 = M bit 2 set (group of 4)      p11 = complement of p13
   *   p14 = M bit 1 set (group of 2)      p12 = complement of p14
   *   p15 = M bit 0 set (single element)
   * The comments assume FMA d = a, b, c computes d = a * b + c (FMA is a
   * macro defined outside this chunk) - TODO confirm.
   */
  1026. .align 16
  1027. .L35:
  1028. tbit.nz p13, p11 = M, 2
  1029. tbit.nz p14, p12 = M, 1
  1030. ;;
  /*
   * AO5/AO6/AO7 were set up four or more elements past AO1.  With no
   * group of 4 (p11) they are pulled back by 4 elements so the group of 2
   * and the single element follow directly after the last processed
   * element.
   */
  1031. { .mmi
  1032. (p11) adds AO5 = - 4 * SIZE, AO5
  1033. }
  1034. { .mbb
  1035. (p11) adds AO7 = - 4 * SIZE, AO7
  1036. }
  1037. ;;
  /* First vector, even-indexed elements of the group of 4; also derive p15. */
  1038. { .mmi
  1039. (p13) LDFD f32 = [AO1], LDA
  1040. (p13) LDFD f34 = [AO3], LDA
  1041. tbit.nz p15, p0 = M, 0
  1042. }
  /* With no group of 2 (p12), AO7 is pulled back by a further 2 elements. */
  1043. { .mmi
  1044. (p14) LDFD f36 = [AO5], LDA
  1045. (p11) adds AO6 = - 4 * SIZE, AO6
  1046. (p12) adds AO7 = - 2 * SIZE, AO7
  1047. }
  1048. ;;
  1049. (p13) LDFD f33 = [AO2], LDA
  1050. (p13) LDFD f35 = [AO4], LDA
  1051. (p14) LDFD f37 = [AO6], LDA
  1052. (p15) LDFD f38 = [AO7], LDA
  1053. ;;
  /*
   * Second vector: every pointer was post-incremented by LDA above, so
   * these plain loads read the matching elements one LDA further on.
   */
  1054. (p13) LDFD f40 = [AO1]
  1055. (p13) LDFD f42 = [AO3]
  1056. (p14) LDFD f44 = [AO5]
  1057. (p15) LDFD f46 = [AO7]
  1058. ;;
  1059. (p13) LDFD f41 = [AO2]
  1060. (p13) LDFD f43 = [AO4]
  1061. (p14) LDFD f45 = [AO6]
  1062. ;;
  /*
   * BUFFER operands.  The group of 4 uses X1 and X2 interleaved and leaves
   * X1 advanced by 4 * SIZE; the group of 2 and the single element then
   * read sequentially through X1 alone.
   */
  1063. (p13) LDFD f96 = [X1], 1 * SIZE
  1064. (p13) LDFD f98 = [X2], 1 * SIZE
  1065. ;;
  1066. (p13) LDFD f97 = [X1], 3 * SIZE
  1067. (p13) LDFD f99 = [X2], 3 * SIZE
  1068. ;;
  1069. (p14) LDFD f100 = [X1], 1 * SIZE
  1070. ;;
  1071. (p14) LDFD f101 = [X1], 1 * SIZE
  1072. ;;
  1073. (p15) LDFD f102 = [X1], 1 * SIZE
  1074. ;;
  /* Accumulate the remainder: f8 for the first vector, f9 for the second. */
  1075. (p13) FMA f8 = f96, f32, f8
  1076. (p13) FMA f9 = f96, f40, f9
  1077. ;;
  1078. (p13) FMA f8 = f97, f33, f8
  1079. (p13) FMA f9 = f97, f41, f9
  1080. ;;
  1081. (p13) FMA f8 = f98, f34, f8
  1082. (p13) FMA f9 = f98, f42, f9
  1083. ;;
  1084. (p13) FMA f8 = f99, f35, f8
  1085. (p13) FMA f9 = f99, f43, f9
  1086. ;;
  1087. (p14) FMA f8 = f100, f36, f8
  1088. (p14) FMA f9 = f100, f44, f9
  1089. ;;
  1090. (p14) FMA f8 = f101, f37, f8
  1091. (p14) FMA f9 = f101, f45, f9
  1092. ;;
  1093. (p15) FMA f8 = f102, f38, f8
  1094. (p15) FMA f9 = f102, f46, f9
  1095. ;;
  /*
   * Write-back: read two values through Y1 (stepping by INCY), add
   * ALPHA * f8 and ALPHA * f9 to them, and store through YY1, which is
   * advanced by INCY after each store.
   */
  1096. LDFD f32 = [Y1], INCY
  1097. ;;
  1098. LDFD f33 = [Y1], INCY
  1099. ;;
  1100. FMA f32 = ALPHA, f8, f32
  1101. FMA f33 = ALPHA, f9, f33
  1102. ;;
  1103. STFD [YY1] = f32
  1104. add YY1 = YY1, INCY
  1105. ;;
  1106. STFD [YY1] = f33
  1107. add YY1 = YY1, INCY
  1108. ;;
  /*
   * .L40 - setup for the case where bit 0 of N is set.
   * A single vector of A is processed (presumably the last remaining
   * column - confirm against the routine header), with its sum accumulated
   * in f8.  When bit 0 of N is clear, control goes to the exit at .L999.
   */
  1109. .align 16
  1110. .L40:
  1111. tbit.z p6, p0 = N, 0
  1112. ;;
  1113. (p6) br.cond.dpnt .L999
  1114. ;;
  /* AO1..AO8 address eight consecutive elements starting at A. */
  1115. mov AO1 = A
  1116. adds AO2 = 1 * SIZE, A
  1117. adds AO3 = 2 * SIZE, A
  1118. adds AO4 = 3 * SIZE, A
  1119. adds AO5 = 4 * SIZE, A
  1120. adds AO6 = 5 * SIZE, A
  1121. adds AO7 = 6 * SIZE, A
  1122. adds AO8 = 7 * SIZE, A
  /* A = A + LDA: step A past the vector handled here. */
  1123. add A = LDA, A
  1124. ;;
  /*
   * Clear f8..f15 by copying f0.  Only f8 is referenced between here and
   * .L999.
   */
  1125. mov f8 = f0
  1126. mov f9 = f0
  1127. mov f10 = f0
  1128. mov f11 = f0
  1129. mov f12 = f0
  1130. mov f13 = f0
  1131. mov f14 = f0
  1132. mov f15 = f0
  /* Clear the rotating predicates; I = M / 8; epilogue count = 2. */
  1133. mov pr.rot= 0
  1134. shr I = M, 3
  1135. mov ar.ec = 2
  1136. ;;
  /* X1 and X2 walk BUFFER, starting at elements 0 and 2 respectively. */
  1137. mov X1 = BUFFER
  1138. adds X2 = 2 * SIZE, BUFFER
  1139. ;;
  /* r0 == r0 always holds, so this sets p16, which guards the loop body. */
  1140. cmp.eq p16, p0 = r0, r0
  1141. ;;
  /* ar.lc = (M >> 3) - 1; when that is -1 (M < 8) skip the loop entirely. */
  1142. adds I = -1, I
  1143. ;;
  1144. mov ar.lc = I
  1145. cmp.eq p6, p0 = -1, I
  1146. (p6) br.cond.dpnt .L45
  1147. ;;
  /*
   * .L42 - main loop for the single-vector case: eight elements per pass.
   * Every instruction is predicated on p16.  ar.lc was loaded with
   * (M >> 3) - 1 and ar.ec with 2 before entry.
   * NOTE(review): ar.ec = 2 appears to add one trailing pass with p16 clear;
   * confirm against the br.ctop definition.
   * FMA and LDFD are macros defined outside this chunk; the comments below
   * assume FMA d = a, b, c computes d = a * b + c - TODO confirm.
   */
  1148. .align 16
  1149. .L42:
  /* f32..f39 <- eight consecutive elements; each AOn advances by 8 * SIZE. */
  1150. (p16) LDFD f32 = [AO1], 8 * SIZE
  1151. (p16) LDFD f34 = [AO3], 8 * SIZE
  1152. (p16) LDFD f36 = [AO5], 8 * SIZE
  1153. (p16) LDFD f38 = [AO7], 8 * SIZE
  1154. ;;
  1155. (p16) LDFD f33 = [AO2], 8 * SIZE
  1156. (p16) LDFD f35 = [AO4], 8 * SIZE
  1157. (p16) LDFD f37 = [AO6], 8 * SIZE
  1158. (p16) LDFD f39 = [AO8], 8 * SIZE
  1159. ;;
  /*
   * f96..f103 <- eight consecutive BUFFER elements.  X1 reads elements
   * 0, 1, 4, 5 and X2 reads 2, 3, 6, 7; the +1/+3 increments advance each
   * pointer by 8 * SIZE per pass.
   */
  1160. (p16) LDFD f96 = [X1], 1 * SIZE
  1161. (p16) LDFD f98 = [X2], 1 * SIZE
  1162. ;;
  1163. (p16) LDFD f97 = [X1], 3 * SIZE
  1164. (p16) LDFD f99 = [X2], 3 * SIZE
  1165. ;;
  1166. (p16) LDFD f100 = [X1], 1 * SIZE
  1167. (p16) LDFD f102 = [X2], 1 * SIZE
  1168. ;;
  1169. (p16) LDFD f101 = [X1], 3 * SIZE
  1170. (p16) LDFD f103 = [X2], 3 * SIZE
  1171. ;;
  /*
   * Accumulate into f8 one element at a time; each FMA reads the f8
   * written by the previous one, hence a stop after every instruction.
   */
  1172. (p16) FMA f8 = f96, f32, f8
  1173. ;;
  1174. (p16) FMA f8 = f97, f33, f8
  1175. ;;
  1176. (p16) FMA f8 = f98, f34, f8
  1177. ;;
  1178. (p16) FMA f8 = f99, f35, f8
  1179. ;;
  1180. (p16) FMA f8 = f100, f36, f8
  1181. ;;
  1182. (p16) FMA f8 = f101, f37, f8
  1183. ;;
  1184. (p16) FMA f8 = f102, f38, f8
  1185. ;;
  1186. (p16) FMA f8 = f103, f39, f8
  /* Counted loop branch driven by ar.lc / ar.ec. */
  1187. br.ctop.sptk.few .L42
  1188. ;;
  /*
   * .L45 - remainder of the single-vector case (the low three bits of M),
   * followed by the write-back of the result.
   *   p13 = M bit 2 set (group of 4)      p11 = complement of p13
   *   p14 = M bit 1 set (group of 2)      p12 = complement of p14
   *   p15 = M bit 0 set (single element)
   * The comments assume FMA d = a, b, c computes d = a * b + c (FMA is a
   * macro defined outside this chunk) - TODO confirm.
   */
  1189. .align 16
  1190. .L45:
  1191. tbit.nz p13, p11 = M, 2
  1192. tbit.nz p14, p12 = M, 1
  1193. ;;
  /*
   * AO5/AO6/AO7 were set up four or more elements past AO1.  With no
   * group of 4 (p11) they are pulled back by 4 elements so the group of 2
   * and the single element follow directly after the last processed
   * element.
   */
  1194. { .mmi
  1195. (p11) adds AO5 = - 4 * SIZE, AO5
  1196. }
  1197. { .mbb
  1198. (p11) adds AO7 = - 4 * SIZE, AO7
  1199. }
  1200. ;;
  /* Even-indexed elements of the group of 4; also derive p15. */
  1201. { .mmi
  1202. (p13) LDFD f32 = [AO1]
  1203. (p13) LDFD f34 = [AO3]
  1204. tbit.nz p15, p0 = M, 0
  1205. }
  /* With no group of 2 (p12), AO7 is pulled back by a further 2 elements. */
  1206. { .mmi
  1207. (p14) LDFD f36 = [AO5]
  1208. (p11) adds AO6 = - 4 * SIZE, AO6
  1209. (p12) adds AO7 = - 2 * SIZE, AO7
  1210. }
  1211. ;;
  1212. (p13) LDFD f33 = [AO2]
  1213. (p13) LDFD f35 = [AO4]
  1214. (p14) LDFD f37 = [AO6]
  1215. (p15) LDFD f38 = [AO7]
  1216. ;;
  /*
   * BUFFER operands.  The group of 4 uses X1 and X2 interleaved and leaves
   * X1 advanced by 4 * SIZE; the group of 2 and the single element then
   * read sequentially through X1 alone.
   */
  1217. (p13) LDFD f96 = [X1], 1 * SIZE
  1218. (p13) LDFD f98 = [X2], 1 * SIZE
  1219. ;;
  1220. (p13) LDFD f97 = [X1], 3 * SIZE
  1221. (p13) LDFD f99 = [X2], 3 * SIZE
  1222. ;;
  1223. (p14) LDFD f100 = [X1], 1 * SIZE
  1224. ;;
  1225. (p14) LDFD f101 = [X1], 1 * SIZE
  1226. ;;
  1227. (p15) LDFD f102 = [X1], 1 * SIZE
  1228. ;;
  /* Accumulate the remainder into f8. */
  1229. (p13) FMA f8 = f96, f32, f8
  1230. ;;
  1231. (p13) FMA f8 = f97, f33, f8
  1232. ;;
  1233. (p13) FMA f8 = f98, f34, f8
  1234. ;;
  1235. (p13) FMA f8 = f99, f35, f8
  1236. ;;
  1237. (p14) FMA f8 = f100, f36, f8
  1238. ;;
  1239. (p14) FMA f8 = f101, f37, f8
  1240. ;;
  1241. (p15) FMA f8 = f102, f38, f8
  1242. ;;
  /*
   * Write-back: read one value through Y1, add ALPHA * f8 to it, and store
   * through YY1.  YY1 is not advanced afterwards; execution falls through
   * to the exit at .L999.
   */
  1243. LDFD f32 = [Y1], INCY
  1244. ;;
  1245. FMA f32 = ALPHA, f8, f32
  1246. ;;
  1247. STFD [YY1] = f32
  /*
   * .L999 - common exit.
   * Reloads the loop-count register from ARLC and all predicate registers
   * from PR (mask -1), then returns through b0.  ARLC and PR presumably hold
   * the values saved in the prologue, which is outside this chunk - confirm.
   */
  1248. .align 16
  1249. .L999:
  1250. mov ar.lc = ARLC
  1251. mov pr = PR, -1
  1252. br.ret.sptk.many b0
  1253. ;;
  /* EPILOGUE is a macro defined outside this chunk; it closes the routine. */
  1254. EPILOGUE