You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zaxpy.S 17 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance, selected at build time by precision. It is used in the
     setup code as (PREFETCH_SIZE + 2) * SIZE, the byte offset added to X1 and
     Y1 to form the initial prefetch addresses PREX1 and PREY1. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 16)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 16)
  44. #else
  45. #define PREFETCH_SIZE (32 * 16)
  46. #endif
  /* FMA1 and FMA2 name the two multiply-add steps whose sign depends on CONJ.
     In this file FMA1 always combines ALPHA_I with the second word of an x
     element into the first-word accumulator of y, and FMA2 always combines
     ALPHA_R with the second word of an x element into the second-word
     accumulator of y. Defining CONJ swaps which of FMA / FNMA each one maps to.
     FMA and FNMA themselves are not defined in this file (presumably common.h
     provides them as multiply-add and negated-multiply-add - confirm). */
  47. #ifndef CONJ
  48. #define FMA1 FNMA
  49. #define FMA2 FMA
  50. #else
  51. #define FMA1 FMA
  52. #define FMA2 FNMA
  53. #endif
  /* Register aliases used by the kernel. */
  /* SP: base used by the entry code to address the slots at SP+16 .. SP+40. */
  54. #define SP r12
  /* Argument registers. In the XDOUBLE build X1, INCX, Y1 and INCY live in
     r14-r17 and are loaded from the stack by the entry code. In the other
     builds X1, INCX and Y1 are read straight from r37-r39 and only INCY (r36)
     is loaded from the stack. NOTE(review): r32-r39 look like the incoming
     stacked argument registers - confirm against the C prototype. */
  55. #ifdef XDOUBLE
  56. #define N r32
  57. #define X1 r14
  58. #define INCX r15
  59. #define Y1 r16
  60. #define INCY r17
  61. #else
  62. #define N r32
  63. #define X1 r37
  64. #define INCX r38
  65. #define Y1 r39
  66. #define INCY r36
  67. #endif
  /* Prefetch pointers for x and y. */
  68. #define PREX1 r2
  69. #define PREY1 r3
  /* I: main-loop trip counter (N / 8, then minus 1 for ar.lc).
     J: N masked with 7, the number of remainder elements. */
  70. #define I r18
  71. #define J r19
  /* X2 / Y2: load pointers one stride beyond X1 / Y1. */
  72. #define Y2 r20
  73. #define X2 r21
  /* Eight times the scaled stride; used as the prefetch step. */
  74. #define INCX8 r22
  75. #define INCY8 r23
  /* Store pointers into y; initialised to Y1..Y4 before the loop. */
  76. #define YY1 r24
  77. #define YY2 r25
  78. #define YY3 r26
  79. #define YY4 r27
  /* loc0..loc9 become available after the alloc in the entry code.
     INC?2M1 = 2 * stride - SIZE and INC?4M1 = 4 * stride - SIZE: the step that
     follows a "+ 1 * SIZE" access to land two or four elements further on. */
  80. #define INCX2M1 loc0
  81. #define INCY2M1 loc1
  82. #define INCX4M1 loc2
  83. #define INCY4M1 loc3
  /* X3 / Y3 and X4 / Y4: load pointers two and three strides beyond X1 / Y1. */
  84. #define X3 loc4
  85. #define Y3 loc5
  86. #define X4 loc6
  87. #define Y4 loc7
  /* Second pair of prefetch pointers, four strides beyond PREX1 / PREY1. */
  88. #define PREX2 loc8
  89. #define PREY2 loc9
  /* Saved copies of ar.lc and of the predicate registers. */
  90. #define ARLC r29
  91. #define PR r30
  /* The two words of the scalar multiplier. They are only read in this file
     (presumably the real and imaginary parts of alpha passed by the caller). */
  92. #define ALPHA_R f8
  93. #define ALPHA_I f9
  /* Kernel entry and loop setup. PROLOGUE and PROFCODE are macros defined
     outside this file. */
  94. PROLOGUE
  95. .prologue
  96. PROFCODE
  /* Form the addresses of the four stack slots at SP+16, +24, +32 and +40. */
  97. { .mmi
  98. adds r14 = 16, SP
  99. adds r15 = 24, SP
  100. adds r16 = 32, SP
  101. }
  /* p15 = (0 > N): return at once when N is negative. */
  102. { .mmb
  103. adds r17 = 40, SP
  104. cmp.gt p15, p0 = r0, N
  105. (p15) br.ret.sptk.many b0
  106. }
  107. ;;
  108. #ifdef XDOUBLE
  /* XDOUBLE build: X1, INCX, Y1 and INCY are all read from the stack. */
  109. { .mmi
  110. ld8 X1 = [r14]
  111. ld8 INCX = [r15]
  112. nop __LINE__
  113. }
  114. { .mmi
  115. ld8 Y1 = [r16]
  116. ld8 INCY = [r17]
  117. nop __LINE__
  118. }
  119. ;;
  120. #else
  /* Other builds: only INCY is read from the stack (slot at SP+16). */
  121. { .mmi
  122. ld8 INCY = [r14]
  123. nop __LINE__
  124. nop __LINE__
  125. }
  126. ;;
  127. #endif
  /* Allocate the register frame (8 inputs, 16 locals, no outputs, no rotating
     general registers); J = N masked with 7; scale INCX by ZBASE_SHIFT
     (presumably turning an element stride into a byte stride - ZBASE_SHIFT is
     defined elsewhere). */
  128. { .mmi
  129. .save ar.pfs, r10
  130. alloc r10 = ar.pfs, 8, 16, 0, 0
  131. and J = 7, N
  132. shl INCX = INCX, ZBASE_SHIFT
  133. }
  /* Initial prefetch addresses ahead of X1 and Y1; scale INCY the same way. */
  134. { .mmi
  135. adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1
  136. adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1
  137. shl INCY = INCY, ZBASE_SHIFT
  138. }
  139. ;;
  /* INCX8 / INCY8 = 8 * stride; save the caller's ar.lc. */
  140. { .mmi
  141. shladd INCX8 = INCX, 3, r0
  142. shladd INCY8 = INCY, 3, r0
  143. .save ar.lc, ARLC
  144. mov ARLC = ar.lc
  145. }
  /* First half of INC?2M1 (stride - SIZE); I = N / 8, the number of full
     eight-element trips. */
  146. { .mmi
  147. adds INCX2M1 = -SIZE, INCX
  148. adds INCY2M1 = -SIZE, INCY
  149. shr I = N, 3
  150. }
  151. ;;
  /* INC?2M1 = 2 * stride - SIZE; save the caller's predicates. */
  152. { .mmi
  153. add INCX2M1 = INCX2M1, INCX
  154. add INCY2M1 = INCY2M1, INCY
  155. mov PR = pr
  156. }
  /* X2 / Y2 point at element 1. */
  157. { .mmi
  158. add X2 = X1, INCX
  159. add Y2 = Y1, INCY
  160. nop __LINE__
  161. }
  162. ;;
  /* INC?4M1 = 4 * stride - SIZE; clear the rotating predicates. */
  163. { .mmi
  164. shladd INCX4M1 = INCX, 1, INCX2M1
  165. shladd INCY4M1 = INCY, 1, INCY2M1
  166. mov pr.rot= 0
  167. }
  /* X3 / Y3 point at element 2. */
  168. { .mmi
  169. shladd X3 = INCX, 1, X1
  170. shladd Y3 = INCY, 1, Y1
  171. }
  172. ;;
  /* X4 / Y4 point at element 3; I becomes trips - 1, the value ar.lc needs. */
  173. { .mmi
  174. shladd X4 = INCX, 1, X2
  175. shladd Y4 = INCY, 1, Y2
  176. adds I = -1, I
  177. }
  /* p16 = 1 enables the first pipeline stage. r8 = Y1's offset inside a
     128-byte block; PREX1 is rounded down to a 128-byte boundary. */
  178. { .mmi
  179. cmp.eq p16, p0 = r0, r0
  180. and r8 = 127, Y1
  181. and PREX1 = -128, PREX1
  182. }
  183. ;;
  /* Store pointers start where the load pointers start; epilogue count 3
     matches the three pipeline stages p16..p18. */
  184. { .mmi
  185. mov YY1 = Y1
  186. mov YY2 = Y2
  187. mov ar.ec = 3
  188. }
  /* PREX1 = (rounded PREX1) | (Y1's offset within 128 bytes). */
  189. { .mmi
  190. mov YY3 = Y3
  191. mov YY4 = Y4
  192. or PREX1 = PREX1, r8
  193. }
  194. ;;
  /* Second prefetch pointers sit four strides further on; load the loop
     counter. */
  195. { .mmi
  196. shladd PREX2 = INCX, 2, PREX1
  197. shladd PREY2 = INCY, 2, PREY1
  198. mov ar.lc = I
  199. }
  /* p11 = (I == -1), i.e. no full trip of eight: skip the main loop.
     p13 = bit 2 of N, consumed by the remainder code at .L25. */
  200. { .mib
  201. cmp.eq p11 ,p0 = -1, I
  202. tbit.z p0, p13 = N, 2
  203. (p11) br.cond.dpnt .L25
  204. }
  205. ;;
  /* .L22 - main loop, XDOUBLE build. One trip covers eight elements of x and
     y. The loop is closed by br.ctop with ar.lc = N/8 - 1 and ar.ec = 3 (both
     set in the setup code), so p16 / p17 / p18 gate successive pipeline stages
     and the rotating registers f32-f127 carry data between them: a value
     loaded into fK under p16 is named fK+1 under p17 and fK+2 under p18.
     Pointer roles: X1 / Y1 visit elements 0 and 4 of each group, X2 / Y2
     elements 1 and 5, X3 / Y3 elements 2 and 6, X4 / Y4 elements 3 and 7;
     YY1..YY4 are the matching store pointers. Each element is two words
     (real and imaginary parts, judging by the arithmetic). */
  206. .align 32
  207. .L22:
  208. #ifdef XDOUBLE
  /* p16: load the first word of y elements 0-3.
     p18: add the ALPHA_I * (second word of x) term to the first-word results
     of elements 0-3 (FMA1). */
  209. { .mmf
  210. (p16) LDFD f80 = [Y1], 1 * SIZE
  211. (p16) LDFD f83 = [Y2], 1 * SIZE
  212. (p18) FMA1 f82 = ALPHA_I, f40, f82
  213. }
  214. { .mmf
  215. (p16) LDFD f92 = [Y3], 1 * SIZE
  216. (p16) LDFD f95 = [Y4], 1 * SIZE
  217. (p18) FMA1 f85 = ALPHA_I, f43, f85
  218. }
  219. ;;
  /* p16: load the second word of y elements 0-3; the INCY4M1 step leaves
     Y1..Y4 on elements 4-7. */
  220. { .mmf
  221. (p16) LDFD f86 = [Y1], INCY4M1
  222. (p16) LDFD f89 = [Y2], INCY4M1
  223. (p18) FMA1 f94 = ALPHA_I, f52, f94
  224. }
  225. { .mmf
  226. (p16) LDFD f98 = [Y3], INCY4M1
  227. (p16) LDFD f101 = [Y4], INCY4M1
  228. (p18) FMA1 f97 = ALPHA_I, f55, f97
  229. }
  230. ;;
  /* p16: load the first word of x elements 0-3.
     p18: add ALPHA_I * (first word of x) to the second-word results of
     elements 0-3. */
  231. { .mmf
  232. (p16) LDFD f32 = [X1], 1 * SIZE
  233. (p16) LDFD f35 = [X2], 1 * SIZE
  234. (p18) FMA f88 = ALPHA_I, f34, f88
  235. }
  236. { .mmf
  237. (p16) LDFD f44 = [X3], 1 * SIZE
  238. (p16) LDFD f47 = [X4], 1 * SIZE
  239. (p18) FMA f91 = ALPHA_I, f37, f91
  240. }
  241. ;;
  /* p16: load the second word of x elements 0-3; X1..X4 move to elements
     4-7. */
  242. { .mmf
  243. (p16) LDFD f38 = [X1], INCX4M1
  244. (p16) LDFD f41 = [X2], INCX4M1
  245. (p18) FMA f100 = ALPHA_I, f46, f100
  246. }
  247. { .mmf
  248. (p16) LDFD f50 = [X3], INCX4M1
  249. (p16) LDFD f53 = [X4], INCX4M1
  250. (p18) FMA f103 = ALPHA_I, f49, f103
  251. }
  252. ;;
  /* p18: store the first-word results of elements 0 and 1 and begin the
     ALPHA_R * (first word of x) term for elements 4-7.
     p19: apply the YY3 / YY4 step that the previous trip's last stores left
     pending (those stores are not followed by an add inside the loop). */
  253. { .mmf
  254. (p18) STFD [YY1] = f82, 1 * SIZE
  255. (p18) STFD [YY2] = f85, 1 * SIZE
  256. (p18) FMA f106 = ALPHA_R, f58, f106
  257. }
  258. { .mmf
  259. (p19) add YY3 = YY3, INCY4M1
  260. (p19) add YY4 = YY4, INCY4M1
  261. (p18) FMA f109 = ALPHA_R, f61, f109
  262. }
  263. ;;
  /* p18: store the first-word results of elements 2 and 3.
     p16: prefetch y through PREY1 / PREY2, stepping by INCY8. */
  264. { .mmf
  265. (p18) STFD [YY3] = f94, 1 * SIZE
  266. (p18) STFD [YY4] = f97, 1 * SIZE
  267. (p18) FMA f118 = ALPHA_R, f70, f118
  268. }
  269. { .mmf
  270. (p16) lfetch.excl.nt1 [PREY1], INCY8
  271. (p16) lfetch.excl.nt1 [PREY2], INCY8
  272. (p18) FMA f121 = ALPHA_R, f73, f121
  273. }
  274. ;;
  /* p18: store the second-word results of elements 0 and 1, then move
     YY1 / YY2 on to elements 4 and 5; ALPHA_R * (second word of x) term for
     elements 4-7 (FMA2). */
  275. { .mmf
  276. (p18) STFD [YY1] = f88
  277. (p18) STFD [YY2] = f91
  278. (p18) FMA2 f112 = ALPHA_R, f64, f112
  279. }
  280. { .mmf
  281. (p18) add YY1 = YY1, INCY4M1
  282. (p18) add YY2 = YY2, INCY4M1
  283. (p18) FMA2 f115 = ALPHA_R, f67, f115
  284. }
  285. ;;
  /* p18: same for elements 2 and 3 through YY3 / YY4. */
  286. { .mmf
  287. (p18) STFD [YY3] = f100
  288. (p18) STFD [YY4] = f103
  289. (p18) FMA2 f124 = ALPHA_R, f76, f124
  290. }
  291. { .mmf
  292. (p18) add YY3 = YY3, INCY4M1
  293. (p18) add YY4 = YY4, INCY4M1
  294. (p18) FMA2 f127 = ALPHA_R, f79, f127
  295. }
  296. ;;
  /* Second half of the group (elements 4-7): same load pattern as above.
     p18: ALPHA_I terms for elements 4-7. */
  297. { .mmf
  298. (p16) LDFD f104 = [Y1], 1 * SIZE
  299. (p16) LDFD f107 = [Y2], 1 * SIZE
  300. (p18) FMA1 f106 = ALPHA_I, f64, f106
  301. }
  302. { .mmf
  303. (p16) LDFD f116 = [Y3], 1 * SIZE
  304. (p16) LDFD f119 = [Y4], 1 * SIZE
  305. (p18) FMA1 f109 = ALPHA_I, f67, f109
  306. }
  307. ;;
  308. { .mmf
  309. (p16) LDFD f110 = [Y1], INCY4M1
  310. (p16) LDFD f113 = [Y2], INCY4M1
  311. (p18) FMA1 f118 = ALPHA_I, f76, f118
  312. }
  313. { .mmf
  314. (p16) LDFD f122 = [Y3], INCY4M1
  315. (p16) LDFD f125 = [Y4], INCY4M1
  316. (p18) FMA1 f121 = ALPHA_I, f79, f121
  317. }
  318. ;;
  319. { .mmf
  320. (p16) LDFD f56 = [X1], 1 * SIZE
  321. (p16) LDFD f59 = [X2], 1 * SIZE
  322. (p18) FMA f112 = ALPHA_I, f58, f112
  323. }
  324. { .mmf
  325. (p16) LDFD f68 = [X3], 1 * SIZE
  326. (p16) LDFD f71 = [X4], 1 * SIZE
  327. (p18) FMA f115 = ALPHA_I, f61, f115
  328. }
  329. ;;
  330. { .mmf
  331. (p16) LDFD f62 = [X1], INCX4M1
  332. (p16) LDFD f65 = [X2], INCX4M1
  333. (p18) FMA f124 = ALPHA_I, f70, f124
  334. }
  335. { .mmf
  336. (p16) LDFD f74 = [X3], INCX4M1
  337. (p16) LDFD f77 = [X4], INCX4M1
  338. (p18) FMA f127 = ALPHA_I, f73, f127
  339. }
  340. ;;
  /* p18: store the first-word results of elements 4 and 5.
     p17: ALPHA_R * (first word of x) term for elements 0-3 of the group that
     is currently in the second stage. */
  341. { .mmf
  342. (p18) STFD [YY1] = f106, 1 * SIZE
  343. (p18) STFD [YY2] = f109, 1 * SIZE
  344. (p17) FMA f81 = ALPHA_R, f33, f81
  345. }
  346. { .mmf
  347. nop __LINE__
  348. nop __LINE__
  349. (p17) FMA f84 = ALPHA_R, f36, f84
  350. }
  351. ;;
  /* p18: store the first-word results of elements 6 and 7.
     p16: prefetch x through PREX1 / PREX2, stepping by INCX8. */
  352. { .mmf
  353. (p18) STFD [YY3] = f118, 1 * SIZE
  354. (p18) STFD [YY4] = f121, 1 * SIZE
  355. (p17) FMA f93 = ALPHA_R, f45, f93
  356. }
  357. { .mmf
  358. (p16) lfetch.nt1 [PREX1], INCX8
  359. (p16) lfetch.nt1 [PREX2], INCX8
  360. (p17) FMA f96 = ALPHA_R, f48, f96
  361. }
  362. ;;
  /* p18: store the second-word results of elements 4 and 5 and move
     YY1 / YY2 to the next group.
     p17: ALPHA_R * (second word of x) term for elements 0-3 (FMA2). */
  363. { .mmf
  364. (p18) STFD [YY1] = f112
  365. (p18) STFD [YY2] = f115
  366. (p17) FMA2 f87 = ALPHA_R, f39, f87
  367. }
  368. { .mmf
  369. (p18) add YY1 = YY1, INCY4M1
  370. (p18) add YY2 = YY2, INCY4M1
  371. (p17) FMA2 f90 = ALPHA_R, f42, f90
  372. }
  373. ;;
  /* p18: store the second-word results of elements 6 and 7. YY3 / YY4 are
     not stepped here; the p19 adds near the top of the loop (or the ones after
     the loop) do it. */
  374. { .mmf
  375. (p18) STFD [YY3] = f124
  376. (p18) STFD [YY4] = f127
  377. (p17) FMA2 f99 = ALPHA_R, f51, f99
  378. }
  379. { .mfb
  380. nop __LINE__
  381. (p17) FMA2 f102 = ALPHA_R, f54, f102
  382. br.ctop.sptk.few .L22
  383. }
  384. ;;
  385. ;;
  /* Loop exit: apply the YY3 / YY4 step still pending from the final trip so
     the remainder code sees all four store pointers on the next element. */
  386. (p19) add YY3 = YY3, INCY4M1
  387. (p19) add YY4 = YY4, INCY4M1
  388. ;;
  389. #else
  /* .L22 loop body for the non-XDOUBLE builds. Same scheme as the XDOUBLE
     variant: one trip covers eight elements, p16 gates the loads, p17 and p18
     gate the arithmetic, and the rotating registers f32-f127 rename each value
     by +1 per trip. Differences visible here: loads and stores are interleaved
     more tightly, only PREY1 / PREX1 are prefetched, and the last two stores
     of a group plus some pointer steps are deferred to the next trip under
     p19. */
  /* p19: store the second-word results of elements 6 and 7 of the previous
     group. They were produced under p18 in f124 and f127; one rotation later
     they are named f125 and f32 (the rotating range wraps from f127 to f32). */
  390. { .mmf
  391. (p19) STFD [YY3] = f125
  392. (p19) STFD [YY4] = f32
  393. (p18) FMA2 f100 = ALPHA_R, f52, f100
  394. }
  /* p16: prefetch y through PREY1, stepping by INCY8. */
  395. { .mmf
  396. (p16) lfetch.excl.nt1 [PREY1], INCY8
  397. nop __LINE__
  398. (p18) FMA2 f103 = ALPHA_R, f55, f103
  399. }
  400. ;;
  /* p16: load the first word of y elements 0-3.
     p18: ALPHA_I * (second word of x) term for the first-word results of
     elements 0-3 (FMA1). */
  401. { .mmf
  402. (p16) LDFD f80 = [Y1], 1 * SIZE
  403. (p16) LDFD f83 = [Y2], 1 * SIZE
  404. (p18) FMA1 f82 = ALPHA_I, f40, f82
  405. }
  406. { .mmf
  407. (p16) LDFD f92 = [Y3], 1 * SIZE
  408. (p16) LDFD f95 = [Y4], 1 * SIZE
  409. (p18) FMA1 f85 = ALPHA_I, f43, f85
  410. }
  411. ;;
  /* p16: load the second word of y elements 0 and 1.
     p19: step YY3 / YY4 past the deferred stores made at the top of this
     trip. */
  412. { .mmf
  413. (p16) LDFD f86 = [Y1], INCY4M1
  414. (p16) LDFD f89 = [Y2], INCY4M1
  415. (p18) FMA1 f94 = ALPHA_I, f52, f94
  416. }
  417. { .mmf
  418. (p19) add YY3 = YY3, INCY4M1
  419. (p19) add YY4 = YY4, INCY4M1
  420. (p18) FMA1 f97 = ALPHA_I, f55, f97
  421. }
  422. ;;
  /* p16: load the second word of y elements 2 and 3.
     p19: step YY1 / YY2 past the last stores of the previous trip.
     p18: ALPHA_I * (first word of x) term for the second-word results. */
  423. { .mmf
  424. (p16) LDFD f98 = [Y3], INCY4M1
  425. (p16) LDFD f101 = [Y4], INCY4M1
  426. (p18) FMA f88 = ALPHA_I, f34, f88
  427. }
  428. { .mmf
  429. (p19) add YY1 = YY1, INCY4M1
  430. (p19) add YY2 = YY2, INCY4M1
  431. (p18) FMA f91 = ALPHA_I, f37, f91
  432. }
  433. ;;
  /* p16: load the first word of x elements 0-3. */
  434. { .mmf
  435. (p16) LDFD f32 = [X1], 1 * SIZE
  436. (p16) LDFD f35 = [X2], 1 * SIZE
  437. (p18) FMA f100 = ALPHA_I, f46, f100
  438. }
  439. { .mmf
  440. (p16) LDFD f44 = [X3], 1 * SIZE
  441. (p16) LDFD f47 = [X4], 1 * SIZE
  442. (p18) FMA f103 = ALPHA_I, f49, f103
  443. }
  444. ;;
  /* p18: store the first-word results of elements 0 and 1; start the
     ALPHA_R * (first word of x) term for elements 4-7.
     p16: load the second word of x elements 0 and 1. */
  445. { .mmf
  446. (p18) STFD [YY1] = f82, 1 * SIZE
  447. (p18) STFD [YY2] = f85, 1 * SIZE
  448. (p18) FMA f106 = ALPHA_R, f58, f106
  449. }
  450. { .mmf
  451. (p16) LDFD f38 = [X1], INCX4M1
  452. (p16) LDFD f41 = [X2], INCX4M1
  453. (p18) FMA f109 = ALPHA_R, f61, f109
  454. }
  455. ;;
  /* p18: store the first-word results of elements 2 and 3.
     p16: load the second word of x elements 2 and 3. */
  456. { .mmf
  457. (p18) STFD [YY3] = f94, 1 * SIZE
  458. (p18) STFD [YY4] = f97, 1 * SIZE
  459. (p18) FMA f118 = ALPHA_R, f70, f118
  460. }
  461. { .mmf
  462. (p16) LDFD f50 = [X3], INCX4M1
  463. (p16) LDFD f53 = [X4], INCX4M1
  464. (p18) FMA f121 = ALPHA_R, f73, f121
  465. }
  466. ;;
  /* p18: store the second-word results of elements 0 and 1;
     ALPHA_R * (second word of x) term for elements 4-7 (FMA2).
     p16: prefetch x through PREX1, stepping by INCX8. */
  467. { .mmf
  468. (p18) STFD [YY1] = f88
  469. (p18) STFD [YY2] = f91
  470. (p18) FMA2 f112 = ALPHA_R, f64, f112
  471. }
  472. { .mmf
  473. (p16) lfetch.nt1 [PREX1], INCX8
  474. nop __LINE__
  475. (p18) FMA2 f115 = ALPHA_R, f67, f115
  476. }
  477. ;;
  /* p18: store the second-word results of elements 2 and 3.
     p16: begin loading elements 4-7 (first word of y elements 4 and 5). */
  478. { .mmf
  479. (p18) STFD [YY3] = f100
  480. (p18) STFD [YY4] = f103
  481. (p18) FMA2 f124 = ALPHA_R, f76, f124
  482. }
  483. { .mmf
  484. (p16) LDFD f104 = [Y1], 1 * SIZE
  485. (p16) LDFD f107 = [Y2], 1 * SIZE
  486. (p18) FMA2 f127 = ALPHA_R, f79, f127
  487. }
  488. ;;
  /* p16: first word of y elements 6 and 7.
     p18: ALPHA_I terms for elements 4-7; move YY1 / YY2 to elements 4 and 5. */
  489. { .mmf
  490. (p16) LDFD f116 = [Y3], 1 * SIZE
  491. (p16) LDFD f119 = [Y4], 1 * SIZE
  492. (p18) FMA1 f106 = ALPHA_I, f64, f106
  493. }
  494. { .mmf
  495. (p18) add YY1 = YY1, INCY4M1
  496. (p18) add YY2 = YY2, INCY4M1
  497. (p18) FMA1 f109 = ALPHA_I, f67, f109
  498. }
  499. ;;
  /* p16: second word of y elements 4 and 5.
     p18: move YY3 / YY4 to elements 6 and 7. */
  500. { .mmf
  501. (p16) LDFD f110 = [Y1], INCY4M1
  502. (p16) LDFD f113 = [Y2], INCY4M1
  503. (p18) FMA1 f118 = ALPHA_I, f76, f118
  504. }
  505. { .mmf
  506. (p18) add YY3 = YY3, INCY4M1
  507. (p18) add YY4 = YY4, INCY4M1
  508. (p18) FMA1 f121 = ALPHA_I, f79, f121
  509. }
  510. ;;
  /* p16: second word of y elements 6 and 7. */
  511. { .mmf
  512. (p16) LDFD f122 = [Y3], INCY4M1
  513. (p16) LDFD f125 = [Y4], INCY4M1
  514. (p18) FMA f112 = ALPHA_I, f58, f112
  515. }
  516. { .mmf
  517. nop __LINE__
  518. nop __LINE__
  519. (p18) FMA f115 = ALPHA_I, f61, f115
  520. }
  521. ;;
  /* p16: first word of x elements 4-7. */
  522. { .mmf
  523. (p16) LDFD f56 = [X1], 1 * SIZE
  524. (p16) LDFD f59 = [X2], 1 * SIZE
  525. (p18) FMA f124 = ALPHA_I, f70, f124
  526. }
  527. { .mmf
  528. (p16) LDFD f68 = [X3], 1 * SIZE
  529. (p16) LDFD f71 = [X4], 1 * SIZE
  530. (p18) FMA f127 = ALPHA_I, f73, f127
  531. }
  532. ;;
  /* p18: store the first-word results of elements 4 and 5.
     p16: second word of x elements 4 and 5.
     p17: ALPHA_R * (first word of x) term for elements 0-3 of the group in
     the second stage. */
  533. { .mmf
  534. (p18) STFD [YY1] = f106, 1 * SIZE
  535. (p18) STFD [YY2] = f109, 1 * SIZE
  536. (p17) FMA f81 = ALPHA_R, f33, f81
  537. }
  538. { .mmf
  539. (p16) LDFD f62 = [X1], INCX4M1
  540. (p16) LDFD f65 = [X2], INCX4M1
  541. (p17) FMA f84 = ALPHA_R, f36, f84
  542. }
  543. ;;
  /* p18: store the first-word results of elements 6 and 7.
     p16: second word of x elements 6 and 7. */
  544. { .mmf
  545. (p18) STFD [YY3] = f118, 1 * SIZE
  546. (p18) STFD [YY4] = f121, 1 * SIZE
  547. (p17) FMA f93 = ALPHA_R, f45, f93
  548. }
  549. { .mmf
  550. (p16) LDFD f74 = [X3], INCX4M1
  551. (p16) LDFD f77 = [X4], INCX4M1
  552. (p17) FMA f96 = ALPHA_R, f48, f96
  553. }
  554. ;;
  /* p18: store the second-word results of elements 4 and 5. The matching
     YY1 / YY2 step and the stores for elements 6 and 7 happen under p19 in the
     next trip, or after the loop. */
  555. { .mmf
  556. (p18) STFD [YY1] = f112
  557. (p18) STFD [YY2] = f115
  558. (p17) FMA2 f87 = ALPHA_R, f39, f87
  559. }
  560. { .mfb
  561. nop __LINE__
  562. (p17) FMA2 f90 = ALPHA_R, f42, f90
  563. br.ctop.sptk.few .L22
  564. }
  565. ;;
  /* Loop exit: perform the deferred stores for elements 6 and 7 of the final
     group and apply the pending steps to all four store pointers. */
  566. { .mmi
  567. (p19) STFD [YY3] = f125
  568. (p19) STFD [YY4] = f32
  569. (p19) add YY1 = YY1, INCY4M1
  570. }
  571. { .mmi
  572. (p19) add YY2 = YY2, INCY4M1
  573. (p19) add YY3 = YY3, INCY4M1
  574. (p19) add YY4 = YY4, INCY4M1
  575. }
  576. ;;
  577. #endif
  /* .L25 - remainder handling and return. Reached by falling out of the main
     loop or directly from the setup code when there is no full group of
     eight. p13, p14 and p15 (bits 2, 1 and 0 of N) enable chunks of four, two
     and one element. For each element the first word of y receives
     ALPHA_R * x_first combined with ALPHA_I * x_second through FMA1, and the
     second word receives ALPHA_R * x_second through FMA2 plus
     ALPHA_I * x_first. */
  578. .align 32
  579. .L25:
  /* p13: load the first word of four x elements; restore the caller's
     ar.lc. */
  580. { .mmi
  581. (p13) LDFD f32 = [X1], 1 * SIZE
  582. (p13) LDFD f34 = [X2], 1 * SIZE
  583. mov ar.lc = ARLC
  584. }
  /* p12 = (J == 0): no remainder elements at all. */
  585. { .mmi
  586. (p13) LDFD f36 = [X3], 1 * SIZE
  587. (p13) LDFD f38 = [X4], 1 * SIZE
  588. cmp.eq p12, p0 = r0, J
  589. }
  590. ;;
  /* Restore the saved predicates. The mask -65474 has bits 1-5 and every bit
     from 16 upward set, so p6-p15 (including p12-p15 used below) keep their
     current values. */
  591. { .mmi
  592. (p13) LDFD f80 = [Y1], 1 * SIZE
  593. (p13) LDFD f82 = [Y2], 1 * SIZE
  594. mov pr = PR, -65474
  595. }
  /* Return here when N is a multiple of 8 (this also covers N == 0). */
  596. { .mmb
  597. (p13) LDFD f84 = [Y3], 1 * SIZE
  598. (p13) LDFD f86 = [Y4], 1 * SIZE
  599. (p12) br.ret.sptk.many b0
  600. }
  601. ;;
  /* p13: load the second words; the INC?4M1 step moves X1 / X2 / Y1 / Y2 four
     elements on, ready for the p14 and p15 chunks. p14 = bit 1 of N. */
  602. { .mmi
  603. (p13) LDFD f33 = [X1], INCX4M1
  604. (p13) LDFD f35 = [X2], INCX4M1
  605. tbit.z p0, p14 = N, 1
  606. }
  607. { .mmi
  608. (p13) LDFD f81 = [Y1], INCY4M1
  609. (p13) LDFD f83 = [Y2], INCY4M1
  610. nop __LINE__
  611. }
  612. ;;
  /* p15 = bit 0 of N. */
  613. { .mmi
  614. (p13) LDFD f37 = [X3], INCX4M1
  615. (p13) LDFD f39 = [X4], INCX4M1
  616. tbit.z p0, p15 = N, 0
  617. }
  618. { .mmi
  619. (p13) LDFD f85 = [Y3], INCY4M1
  620. (p13) LDFD f87 = [Y4], INCY4M1
  621. nop __LINE__
  622. }
  623. ;;
  /* p14: load two more elements through X1 / X2 and Y1 / Y2. */
  624. { .mmf
  625. (p14) LDFD f40 = [X1], 1 * SIZE
  626. (p14) LDFD f42 = [X2], 1 * SIZE
  627. }
  628. ;;
  629. { .mmf
  630. (p14) LDFD f88 = [Y1], 1 * SIZE
  631. (p14) LDFD f90 = [Y2], 1 * SIZE
  632. }
  633. ;;
  /* p14: second words, stepping X1 / X2 two elements on (INCX2M1).
     p13: first-word results gain ALPHA_R * x_first. */
  634. { .mmf
  635. (p14) LDFD f41 = [X1], INCX2M1
  636. (p14) LDFD f43 = [X2], INCX2M1
  637. (p13) FMA f80 = ALPHA_R, f32, f80
  638. }
  639. { .mmf
  640. nop __LINE__
  641. nop __LINE__
  642. (p13) FMA f82 = ALPHA_R, f34, f82
  643. }
  644. ;;
  645. { .mmf
  646. (p14) LDFD f89 = [Y1], INCY2M1
  647. (p14) LDFD f91 = [Y2], INCY2M1
  648. (p13) FMA f84 = ALPHA_R, f36, f84
  649. }
  650. { .mmf
  651. nop __LINE__
  652. nop __LINE__
  653. (p13) FMA f86 = ALPHA_R, f38, f86
  654. }
  655. ;;
  /* p15: load the first word of the single last element.
     p13: second-word results gain ALPHA_R * x_second through FMA2. */
  656. { .mmf
  657. (p15) LDFD f44 = [X1], 1 * SIZE
  658. (p15) LDFD f92 = [Y1], 1 * SIZE
  659. (p13) FMA2 f81 = ALPHA_R, f33, f81
  660. }
  661. { .mmf
  662. nop __LINE__
  663. nop __LINE__
  664. (p13) FMA2 f83 = ALPHA_R, f35, f83
  665. }
  666. ;;
  /* p15: second word of the last element (no pointer step needed). */
  667. { .mmf
  668. (p15) LDFD f45 = [X1]
  669. (p15) LDFD f93 = [Y1]
  670. (p13) FMA2 f85 = ALPHA_R, f37, f85
  671. }
  672. { .mmf
  673. nop __LINE__
  674. nop __LINE__
  675. (p13) FMA2 f87 = ALPHA_R, f39, f87
  676. }
  677. ;;
  /* p13: ALPHA_I terms - FMA1 with x_second into the first-word results,
     FMA with x_first into the second-word results. */
  678. { .mmf
  679. nop __LINE__
  680. nop __LINE__
  681. (p13) FMA1 f80 = ALPHA_I, f33, f80
  682. }
  683. { .mmf
  684. nop __LINE__
  685. nop __LINE__
  686. (p13) FMA1 f82 = ALPHA_I, f35, f82
  687. }
  688. { .mmf
  689. nop __LINE__
  690. nop __LINE__
  691. (p13) FMA1 f84 = ALPHA_I, f37, f84
  692. }
  693. { .mmf
  694. nop __LINE__
  695. nop __LINE__
  696. (p13) FMA1 f86 = ALPHA_I, f39, f86
  697. }
  698. { .mmf
  699. nop __LINE__
  700. nop __LINE__
  701. (p13) FMA f81 = ALPHA_I, f32, f81
  702. }
  703. { .mmf
  704. nop __LINE__
  705. nop __LINE__
  706. (p13) FMA f83 = ALPHA_I, f34, f83
  707. }
  708. { .mmf
  709. nop __LINE__
  710. nop __LINE__
  711. (p13) FMA f85 = ALPHA_I, f36, f85
  712. }
  713. { .mmf
  714. nop __LINE__
  715. nop __LINE__
  716. (p13) FMA f87 = ALPHA_I, f38, f87
  717. }
  718. ;;
  /* p13: store the first-word results of the four-element chunk.
     p14: ALPHA_R terms for the two-element chunk. */
  719. { .mmf
  720. (p13) STFD [YY1] = f80, 1 * SIZE
  721. (p13) STFD [YY2] = f82, 1 * SIZE
  722. (p14) FMA f88 = ALPHA_R, f40, f88
  723. }
  724. { .mmf
  725. nop __LINE__
  726. nop __LINE__
  727. (p14) FMA f90 = ALPHA_R, f42, f90
  728. }
  729. ;;
  730. { .mmf
  731. (p13) STFD [YY3] = f84, 1 * SIZE
  732. (p13) STFD [YY4] = f86, 1 * SIZE
  733. (p14) FMA2 f89 = ALPHA_R, f41, f89
  734. }
  735. { .mmf
  736. nop __LINE__
  737. nop __LINE__
  738. (p14) FMA2 f91 = ALPHA_R, f43, f91
  739. }
  740. ;;
  /* p13: store the second-word results, then move the store pointers four
     elements on. p15: ALPHA_R terms for the single last element. */
  741. { .mmf
  742. (p13) STFD [YY1] = f81
  743. (p13) STFD [YY2] = f83
  744. (p15) FMA f92 = ALPHA_R, f44, f92
  745. }
  746. { .mmf
  747. (p13) add YY1 = YY1, INCY4M1
  748. (p13) add YY2 = YY2, INCY4M1
  749. (p15) FMA2 f93 = ALPHA_R, f45, f93
  750. }
  751. ;;
  752. { .mmf
  753. (p13) STFD [YY3] = f85
  754. (p13) STFD [YY4] = f87
  755. (p14) FMA1 f88 = ALPHA_I, f41, f88
  756. }
  757. { .mmf
  758. (p13) add YY3 = YY3, INCY4M1
  759. (p13) add YY4 = YY4, INCY4M1
  760. (p14) FMA1 f90 = ALPHA_I, f43, f90
  761. }
  762. ;;
  /* Remaining ALPHA_I terms for the two-element and one-element chunks. */
  763. { .mmf
  764. nop __LINE__
  765. nop __LINE__
  766. (p14) FMA f89 = ALPHA_I, f40, f89
  767. }
  768. { .mmf
  769. nop __LINE__
  770. nop __LINE__
  771. (p14) FMA f91 = ALPHA_I, f42, f91
  772. }
  773. { .mmf
  774. nop __LINE__
  775. nop __LINE__
  776. (p15) FMA1 f92 = ALPHA_I, f45, f92
  777. }
  778. { .mmf
  779. nop __LINE__
  780. nop __LINE__
  781. (p15) FMA f93 = ALPHA_I, f44, f93
  782. }
  783. ;;
  /* p14: store both words of the two-element chunk; YY1 then moves two
     elements on so that it addresses the last single element. */
  784. { .mmi
  785. (p14) STFD [YY1] = f88, 1 * SIZE
  786. (p14) STFD [YY2] = f90, 1 * SIZE
  787. nop __LINE__
  788. }
  789. ;;
  790. { .mmi
  791. (p14) STFD [YY1] = f89
  792. (p14) STFD [YY2] = f91
  793. (p14) add YY1 = YY1, INCY2M1
  794. }
  795. ;;
  /* p15: store both words of the last element, then return. */
  796. { .mmi
  797. (p15) STFD [YY1] = f92, 1 * SIZE
  798. nop __LINE__
  799. nop __LINE__
  800. }
  801. ;;
  802. { .mmb
  803. (p15) STFD [YY1] = f93
  804. nop __LINE__
  805. br.ret.sptk.many b0
  806. }
  807. ;;
  808. EPILOGUE