You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_ncopy.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included.  The names PROLOGUE,
     EPILOGUE, PROFCODE, SIZE, ZBASE_SHIFT, LDF8 and STF8_NTA are used below
     but not defined in this file; presumably they come from common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distances, multiplied by SIZE where used: PREFETCHSIZE is the
     lead of the source prefetch pointer (PREA), WPREFETCHSIZE the lead of
     the destination prefetch pointer (PREB). */
  40. #define PREFETCHSIZE 64
  41. #define WPREFETCHSIZE 32
  /* Load / store mnemonics used for every data move in this routine.
     NOTE(review): LDF8 / STF8_NTA are opaque here; presumably an 8-byte FP
     load and a non-temporal FP store - confirm in common.h. */
  42. #define LD LDF8
  43. #define ST STF8_NTA
  /* TEMP: scratch holding the amount subtracted from PREA on odd COUNT. */
  44. #define TEMP r2
  /* I: inner-loop trip count (M >> 2, then decremented for ar.lc).
     J: number of four-vector groups still to process (N >> 2). */
  45. #define I r14
  46. #define J r15
  /* Prefetch pointers: PREB walks the destination, PREA the source. */
  47. #define PREB r16
  48. #define PREA r17
  /* A1..A4: read pointers to up to four source vectors spaced LDA apart.
     A5..A8: the same four vectors, 4 * SIZE further along. */
  49. #define A1 r18
  50. #define A2 r19
  51. #define A3 r20
  52. #define A4 r21
  53. #define A5 r22
  54. #define A6 r23
  55. #define A7 r24
  56. #define A8 r25
  /* B1: second store pointer, kept 4 * SIZE ahead of B. */
  57. #define B1 r26
  /* COUNT: loop-iteration counter; its low bit gates the PREA correction. */
  58. #define COUNT r28
  /* ARLC / PR: copies of ar.lc and the predicate registers taken at entry
     and written back at .L999. */
  59. #define ARLC r30
  60. #define PR r31
  /* r32..r36 are read before they are ever written in this routine, i.e.
     they are inputs supplied by the caller: M, N, A, LDA, B. */
  61. #define M r32
  62. #define N r33
  63. #define A r34
  64. #define LDA r35
  65. #define B r36
  /* Function entry.  Saves the state that is restored at .L999, derives
     the remainder predicates from M, and skips straight to .L20 when there
     is no complete group of four source vectors (N >> 2 == 0). */
  66. PROLOGUE
  67. .prologue
  68. PROFCODE
  69. .body
  /* LDA = LDA << ZBASE_SHIFT (ZBASE_SHIFT is not defined in this file;
     presumably it turns an element count into a byte stride - confirm).
     PR = copy of all predicate registers.  J = N >> 2. */
  70. { .mii
  71. shladd LDA= LDA, ZBASE_SHIFT, r0
  72. mov PR = pr
  73. shr J = N, 2
  74. }
  75. ;;
  /* COUNT = 0.  p10 = bit 1 of M is set, p11 = bit 0 of M is set; these two
     predicates drive the remainder code at .L15 / .L25 / .L35. */
  76. { .mii
  77. mov COUNT=r0
  78. tbit.nz p10, p0 =M, 1
  79. tbit.nz p11, p0 =M, 0
  80. }
  81. ;;
  82. { .mmb
  83. nop __LINE__
  84. nop __LINE__
  85. nop __LINE__
  86. }
  /* p8 = (J == 0).  ARLC = copy of ar.lc.  Branch to .L20 when p8 is set. */
  87. { .mib
  88. cmp.eq p8,p0 = 0, J
  89. mov ARLC = ar.lc
  90. (p8) br.cond.dpnt .L20
  91. }
  92. ;;
  /* .L11: set-up for one group of four source vectors.  Entered once per
     group; the tail of .L15 branches back here while J != 0. */
  93. .align 32
  94. .L11:
  /* A1 = A, A2 = A + LDA; clear the rotating predicates. */
  95. { .mmi
  96. mov A1 = A
  97. add A2 = A, LDA
  98. mov pr.rot = 0
  99. }
  /* A3 = A + 2 * LDA; B1 = B + 4 * SIZE; I = M >> 2. */
  100. { .mmi
  101. shladd A3 = LDA, 1, A
  102. adds B1 = 4 * SIZE, B
  103. shr I = M, 2
  104. }
  105. ;;
  /* A4 = A2 + 2 * LDA; p16 = 1 (cmp.eq r0, r0 is always true) to start the
     first pipeline stage; epilog count ar.ec = 3. */
  106. { .mmi
  107. shladd A4 = LDA, 1, A2
  108. cmp.eq p16,p0 = r0, r0
  109. mov ar.ec = 3
  110. }
  /* p6 = (I == 0) is evaluated on the undecremented I; then I and J are
     each decremented by one. */
  111. { .mmi
  112. cmp.eq p6,p0 = 0,I
  113. adds I =-1, I
  114. adds J =-1, J
  115. }
  116. ;;
  /* Advance A by 4 * LDA for the next group; A5/A6 = A1/A2 + 4 * SIZE. */
  117. { .mmi
  118. shladd A = LDA, 2, A
  119. adds A5 = 4 * SIZE, A1
  120. adds A6 = 4 * SIZE, A2
  121. }
  /* A7/A8 = A3/A4 + 4 * SIZE; PREA = A1 + PREFETCHSIZE * SIZE. */
  122. { .mmi
  123. adds A7 = 4 * SIZE, A3
  124. adds A8 = 4 * SIZE, A4
  125. adds PREA = PREFETCHSIZE * SIZE,A1
  126. }
  127. ;;
  128. { .mmb
  129. nop __LINE__
  130. nop __LINE__
  131. nop __LINE__
  132. }
  /* PREB = B + WPREFETCHSIZE * SIZE; ar.lc = I (already M >> 2 minus 1).
     When p6 is set (M >> 2 == 0) the loop is skipped and .L15 runs. */
  133. { .mib
  134. adds PREB = WPREFETCHSIZE * SIZE, B
  135. mov ar.lc = I
  136. (p6) br.cond.dpnt.few .L15
  137. }
  138. ;;
  /* .L12: counted loop closed by br.ctop, copying from four source vectors.
     Loads are predicated on p16 and stores on p18; each store names the FP
     register two above the one written by the matching load (load f32 ->
     store f34, load f80 -> store f82, ...).
     Per pass, each of A1..A8 reads four consecutive SIZE-sized values and
     then skips ahead (post-increments SIZE, SIZE, SIZE, 5 * SIZE = 8 * SIZE
     in total), and B and B1 each receive sixteen values in four runs of
     four (every fourth store post-increments by 5 * SIZE, so the two
     pointers interleave).  B takes data read via A1/A2/A5/A6, B1 takes
     data read via A3/A4/A7/A8.
     NOTE(review): the file name suggests each pair of values is one
     complex (re, im) element - confirm against the caller. */
  139. .align 32
  140. .L12:
  /* First prefetch pair of the pass: PREA += LDA, PREB += 16 * SIZE. */
  141. { .mmb
  142. (p16) lfetch.nt1 [PREA], LDA
  143. (p16) lfetch.excl.nt1 [PREB], 16 * SIZE
  144. nop __LINE__
  145. }
  146. ;;
  /* Loads: first four values via A1 and via A5.
     Stores: first two values of the A1 and A3 vectors, then of A2 and A4. */
  147. { .mmb
  148. (p18) ST [B ] = f34, SIZE
  149. (p18) ST [B1] = f82, SIZE
  150. nop __LINE__
  151. }
  152. { .mmb
  153. (p16) LD f32 = [A1], SIZE
  154. (p16) LD f35 = [A5], SIZE
  155. nop __LINE__
  156. }
  157. ;;
  158. { .mmb
  159. (p18) ST [B ] = f40, SIZE
  160. (p18) ST [B1] = f88, SIZE
  161. nop __LINE__
  162. }
  163. { .mmb
  164. (p16) LD f38 = [A1], SIZE
  165. (p16) LD f41 = [A5], SIZE
  166. nop __LINE__
  167. }
  168. ;;
  169. { .mmb
  170. (p18) ST [B ] = f58, SIZE
  171. (p18) ST [B1] = f106, SIZE
  172. nop __LINE__
  173. }
  174. { .mmb
  175. (p16) LD f44 = [A1], SIZE
  176. (p16) LD f47 = [A5], SIZE
  177. nop __LINE__
  178. }
  179. ;;
  /* p7 = low bit of COUNT is set (odd pass); used below to rewind PREA. */
  180. { .mmi
  181. (p18) ST [B ] = f64, 5 * SIZE
  182. (p18) ST [B1] = f112, 5 * SIZE
  183. tbit.z p0,p7 = COUNT,0
  184. }
  185. { .mmb
  186. (p16) LD f50 = [A1], 5 * SIZE
  187. (p16) LD f53 = [A5], 5 * SIZE
  188. nop __LINE__
  189. }
  190. ;;
  /* Loads: four values via A2 and via A6.
     Stores: third and fourth values of each of the four vectors. */
  191. { .mmb
  192. (p18) ST [B ] = f46, SIZE
  193. (p18) ST [B1] = f94, SIZE
  194. nop __LINE__
  195. }
  196. { .mmb
  197. (p16) LD f56 = [A2], SIZE
  198. (p16) LD f59 = [A6], SIZE
  199. nop __LINE__
  200. }
  201. ;;
  202. { .mmb
  203. (p18) ST [B ] = f52, SIZE
  204. (p18) ST [B1] = f100, SIZE
  205. nop __LINE__
  206. }
  207. { .mmb
  208. (p16) LD f62 = [A2], SIZE
  209. (p16) LD f65 = [A6], SIZE
  210. nop __LINE__
  211. }
  212. ;;
  213. { .mmb
  214. (p18) ST [B ] = f70, SIZE
  215. (p18) ST [B1] = f118, SIZE
  216. nop __LINE__
  217. }
  218. { .mmb
  219. (p16) LD f68 = [A2], SIZE
  220. (p16) LD f71 = [A6], SIZE
  221. nop __LINE__
  222. }
  223. ;;
  /* TEMP = 4 * LDA (first half of the PREA correction). */
  224. { .mmi
  225. (p18) ST [B ] = f76, 5 * SIZE
  226. (p18) ST [B1] = f124, 5 * SIZE
  227. shladd TEMP = LDA, 2, r0
  228. }
  229. { .mmb
  230. (p16) LD f74 = [A2], 5 * SIZE
  231. (p16) LD f77 = [A6], 5 * SIZE
  232. nop __LINE__
  233. }
  234. ;;
  /* Second prefetch pair of the pass. */
  235. { .mmb
  236. (p16) lfetch.nt1 [PREA], LDA
  237. (p16) lfetch.excl.nt1 [PREB], 16 * SIZE
  238. nop __LINE__
  239. }
  240. ;;
  /* Loads: four values via A3 and via A7.
     Stores: the data that was loaded through A5..A8 (f35 -> f37, ...). */
  241. { .mmb
  242. (p18) ST [B ] = f37, SIZE
  243. (p18) ST [B1] = f85, SIZE
  244. nop __LINE__
  245. }
  246. { .mmb
  247. (p16) LD f80 = [A3], SIZE
  248. (p16) LD f83 = [A7], SIZE
  249. nop __LINE__
  250. }
  251. ;;
  /* TEMP = 4 * LDA - 16 * SIZE. */
  252. { .mmi
  253. (p18) ST [B ] = f43, SIZE
  254. (p18) ST [B1] = f91, SIZE
  255. adds TEMP = -16 * SIZE, TEMP
  256. }
  257. { .mmb
  258. (p16) LD f86 = [A3], SIZE
  259. (p16) LD f89 = [A7], SIZE
  260. nop __LINE__
  261. }
  262. ;;
  /* On odd passes pull PREA back by TEMP.  Two odd/even passes add
     4 * LDA to PREA through the lfetch post-increments, so the net
     movement per two passes is 16 * SIZE. */
  263. { .mmi
  264. (p18) ST [B ] = f61, SIZE
  265. (p18) ST [B1] = f109, SIZE
  266. (p7) sub PREA = PREA, TEMP
  267. }
  268. { .mmb
  269. (p16) LD f92 = [A3], SIZE
  270. (p16) LD f95 = [A7], SIZE
  271. nop __LINE__
  272. }
  273. ;;
  274. { .mmb
  275. (p18) ST [B ] = f67, 5 * SIZE
  276. (p18) ST [B1] = f115, 5 * SIZE
  277. nop __LINE__
  278. }
  279. { .mmb
  280. (p16) LD f98 = [A3], 5 * SIZE
  281. (p16) LD f101 = [A7], 5 * SIZE
  282. nop __LINE__
  283. }
  284. ;;
  /* Loads: four values via A4 and via A8.  Stores: last run of four. */
  285. { .mmb
  286. (p18) ST [B ] = f49, SIZE
  287. (p18) ST [B1] = f97, SIZE
  288. nop __LINE__
  289. }
  290. { .mmb
  291. (p16) LD f104 = [A4], SIZE
  292. (p16) LD f107 = [A8], SIZE
  293. nop __LINE__
  294. }
  295. ;;
  296. { .mmb
  297. (p18) ST [B ] = f55, SIZE
  298. (p18) ST [B1] = f103, SIZE
  299. nop __LINE__
  300. }
  301. { .mmb
  302. (p16) LD f110 = [A4], SIZE
  303. (p16) LD f113 = [A8], SIZE
  304. nop __LINE__
  305. }
  306. ;;
  307. { .mmb
  308. (p18) ST [B ] = f73, SIZE
  309. (p18) ST [B1] = f121, SIZE
  310. nop __LINE__
  311. }
  312. { .mmb
  313. (p16) LD f116 = [A4], SIZE
  314. (p16) LD f119 = [A8], SIZE
  315. nop __LINE__
  316. }
  317. ;;
  /* COUNT is bumped only while p16 is set, i.e. on passes that issue
     loads. */
  318. { .mmi
  319. (p18) ST [B ] = f79, 5 * SIZE
  320. (p18) ST [B1] = f127, 5 * SIZE
  321. (p16) adds COUNT = 1, COUNT
  322. }
  /* br.ctop closes the counted loop (ar.lc / ar.ec were set before
     entry). */
  323. { .mmb
  324. (p16) LD f122 = [A4], 5 * SIZE
  325. (p16) LD f125 = [A8], 5 * SIZE
  326. br.ctop.sptk.few .L12
  327. }
  328. ;;
  /* .L15: remainder of the four-vector group, after M >> 2 full loop passes.
     Everything is predicated: p10 (bit 1 of M set) copies four values from
     each of A1..A4, p11 (bit 0 of M set) copies two more from each.
     Afterwards COUNT is cleared and control returns to .L11 while J != 0;
     otherwise execution falls through to .L20. */
  329. .align 32
  330. .L15:
  /* p10 loads: f32..f35 via A1, f40..f43 via A2. */
  331. { .mmb
  332. (p10) LD f32 = [A1], SIZE
  333. (p10) LD f40 = [A2], SIZE
  334. nop __LINE__
  335. }
  336. ;;
  337. { .mmb
  338. (p10) LD f33 = [A1], SIZE
  339. (p10) LD f41 = [A2], SIZE
  340. nop __LINE__
  341. }
  342. ;;
  343. { .mmb
  344. (p10) LD f34 = [A1], SIZE
  345. (p10) LD f42 = [A2], SIZE
  346. nop __LINE__
  347. }
  348. ;;
  349. { .mmb
  350. (p10) LD f35 = [A1], SIZE
  351. (p10) LD f43 = [A2], SIZE
  352. nop __LINE__
  353. }
  354. ;;
  /* p10 loads: f50..f53 via A3, f60..f63 via A4. */
  355. { .mmb
  356. (p10) LD f50 = [A3], SIZE
  357. (p10) LD f60 = [A4], SIZE
  358. nop __LINE__
  359. }
  360. ;;
  361. { .mmb
  362. (p10) LD f51 = [A3], SIZE
  363. (p10) LD f61 = [A4], SIZE
  364. nop __LINE__
  365. }
  366. ;;
  367. { .mmb
  368. (p10) LD f52 = [A3], SIZE
  369. (p10) LD f62 = [A4], SIZE
  370. nop __LINE__
  371. }
  372. ;;
  373. { .mmb
  374. (p10) LD f53 = [A3], SIZE
  375. (p10) LD f63 = [A4], SIZE
  376. nop __LINE__
  377. }
  378. ;;
  /* p11 loads: two values from each vector.  The second load of each pair
     has no post-increment, so A1..A4 are left pointing at that value. */
  379. { .mmb
  380. (p11) LD f36 = [A1], SIZE
  381. (p11) LD f44 = [A2], SIZE
  382. nop __LINE__
  383. }
  384. ;;
  385. { .mmb
  386. (p11) LD f37 = [A1]
  387. (p11) LD f45 = [A2]
  388. nop __LINE__
  389. }
  390. ;;
  391. { .mmb
  392. (p11) LD f54 = [A3], SIZE
  393. (p11) LD f64 = [A4], SIZE
  394. nop __LINE__
  395. }
  396. ;;
  397. { .mmb
  398. (p11) LD f55 = [A3]
  399. (p11) LD f65 = [A4]
  400. nop __LINE__
  401. }
  402. ;;
  /* p10 stores, first run: B gets the first two values of the A1 and A2
     vectors, B1 (4 * SIZE ahead) the first two of A3 and A4.  The fourth
     store of each run post-increments by 5 * SIZE to hop over the other
     pointer's run. */
  403. { .mmb
  404. (p10) ST [B ] = f32, SIZE
  405. (p10) ST [B1] = f50, SIZE
  406. nop __LINE__
  407. }
  408. ;;
  409. { .mmb
  410. (p10) ST [B ] = f33, SIZE
  411. (p10) ST [B1] = f51, SIZE
  412. nop __LINE__
  413. }
  414. ;;
  415. { .mmb
  416. (p10) ST [B ] = f40, SIZE
  417. (p10) ST [B1] = f60, SIZE
  418. nop __LINE__
  419. }
  420. ;;
  421. { .mmb
  422. (p10) ST [B ] = f41, 5 * SIZE
  423. (p10) ST [B1] = f61, 5 * SIZE
  424. nop __LINE__
  425. }
  426. ;;
  /* p10 stores, second run: third and fourth values of each vector. */
  427. { .mmb
  428. (p10) ST [B ] = f34, SIZE
  429. (p10) ST [B1] = f52, SIZE
  430. nop __LINE__
  431. }
  432. ;;
  433. { .mmb
  434. (p10) ST [B ] = f35, SIZE
  435. (p10) ST [B1] = f53, SIZE
  436. nop __LINE__
  437. }
  438. ;;
  439. { .mmb
  440. (p10) ST [B ] = f42, SIZE
  441. (p10) ST [B1] = f62, SIZE
  442. nop __LINE__
  443. }
  444. ;;
  445. { .mmb
  446. (p10) ST [B ] = f43, 5 * SIZE
  447. (p10) ST [B1] = f63, 5 * SIZE
  448. nop __LINE__
  449. }
  450. ;;
  /* p11 stores: the last two values of each of the four vectors. */
  451. { .mmb
  452. (p11) ST [B ] = f36, SIZE
  453. (p11) ST [B1] = f54, SIZE
  454. nop __LINE__
  455. }
  456. ;;
  /* COUNT is reset to 0 here, unconditionally, before the next group. */
  457. { .mmi
  458. (p11) ST [B ] = f37, SIZE
  459. (p11) ST [B1] = f55, SIZE
  460. mov COUNT = r0
  461. }
  462. ;;
  /* p6 = (J != 0): more four-vector groups remain. */
  463. { .mmi
  464. (p11) ST [B ] = f44, SIZE
  465. (p11) ST [B1] = f64, SIZE
  466. cmp.eq p0,p6 = 0,J
  467. }
  468. ;;
  469. { .mmb
  470. (p11) ST [B ] = f45, 5 * SIZE
  471. (p11) ST [B1] = f65, 5 * SIZE
  472. (p6) br.cond.dptk.few .L11
  473. }
  474. ;;
  /* .L20: set-up for a group of two source vectors.  Runs only when bit 1
     of N is set; otherwise control goes straight to .L30. */
  475. .align 32
  476. .L20:
  /* A1 = A, A2 = A + LDA; clear the rotating predicates. */
  477. { .mmi
  478. mov A1 = A
  479. add A2 = A,LDA
  480. mov pr.rot = 0
  481. }
  /* A5 = A + 4 * SIZE; B1 = B + 4 * SIZE; p8 = bit 1 of N is clear. */
  482. { .mmi
  483. adds A5 = 4 * SIZE, A
  484. adds B1 = 4 * SIZE, B
  485. tbit.z p8, p0 = N, 1
  486. }
  487. ;;
  /* p16 = 1 to start the first pipeline stage; PREA = A + PREFETCHSIZE *
     SIZE; epilog count ar.ec = 3. */
  488. { .mmi
  489. cmp.eq p16,p0 = r0,r0
  490. adds PREA = PREFETCHSIZE * SIZE, A
  491. mov ar.ec = 3
  492. }
  493. ;;
  /* PREB = B + WPREFETCHSIZE * SIZE; I = M >> 2; skip to .L30 when p8. */
  494. { .mib
  495. adds PREB = WPREFETCHSIZE * SIZE,B
  496. shr I = M, 2
  497. (p8) br.cond.dpnt.few .L30
  498. }
  499. ;;
  /* Advance A by 2 * LDA past this group; p6 = (I == 0) is evaluated on the
     undecremented I, then I is decremented for ar.lc. */
  500. { .mmi
  501. shladd A = LDA, 1, A
  502. cmp.eq p6, p0 = 0, I
  503. adds I = -1, I
  504. }
  505. ;;
  /* A6 = A2 + 4 * SIZE; ar.lc = I; go to the remainder code at .L25 when
     there is no full loop pass (p6). */
  506. { .mib
  507. adds A6 = 4 * SIZE, A2
  508. mov ar.lc = I
  509. (p6) br.cond.dpnt.few .L25
  510. }
  511. ;;
  /* .L21: counted loop closed by br.ctop, copying from two source vectors.
     Loads are predicated on p16 and stores on p18; each store names the FP
     register two above the one written by the matching load.
     Per pass, A1/A5 and A2/A6 each read four consecutive SIZE-sized values
     (post-increments SIZE, SIZE, SIZE, 5 * SIZE), and B and B1 each receive
     eight values in two runs of four.  B takes the values read via A1 then
     A2 (f34, f40, f58, f64), B1 the next two values of each vector, and the
     second runs carry the data read via A5/A6. */
  512. .align 32
  513. .L21:
  /* One prefetch pair per pass: PREA += LDA, PREB += 16 * SIZE. */
  514. { .mmb
  515. (p16) lfetch.nt1 [PREA],LDA
  516. (p16) lfetch.excl.nt1 [PREB ],16 * SIZE
  517. nop __LINE__
  518. }
  519. { .mmb
  520. nop __LINE__
  521. nop __LINE__
  522. nop __LINE__
  523. }
  524. ;;
  525. { .mmb
  526. (p18) ST [B ] = f34, SIZE
  527. (p18) ST [B1] = f46, SIZE
  528. nop __LINE__
  529. }
  530. { .mmb
  531. (p16) LD f32 = [A1], SIZE
  532. (p16) LD f35 = [A5], SIZE
  533. nop __LINE__
  534. }
  535. ;;
  536. { .mmb
  537. (p18) ST [B ] = f40, SIZE
  538. (p18) ST [B1] = f52, SIZE
  539. nop __LINE__
  540. }
  541. { .mmb
  542. (p16) LD f38 = [A1], SIZE
  543. (p16) LD f41 = [A5], SIZE
  544. nop __LINE__
  545. }
  546. ;;
  547. { .mmb
  548. (p18) ST [B ] = f58, SIZE
  549. (p18) ST [B1] = f70, SIZE
  550. nop __LINE__
  551. }
  552. { .mmb
  553. (p16) LD f44 = [A1], SIZE
  554. (p16) LD f47 = [A5], SIZE
  555. nop __LINE__
  556. }
  557. ;;
  /* p7 = low bit of COUNT is set (odd pass). */
  558. { .mmi
  559. (p18) ST [B ] = f64, 5 * SIZE
  560. (p18) ST [B1] = f76, 5 * SIZE
  561. tbit.z p0,p7 = COUNT,0
  562. }
  563. { .mmb
  564. (p16) LD f50 = [A1], 5 * SIZE
  565. (p16) LD f53 = [A5], 5 * SIZE
  566. nop __LINE__
  567. }
  568. ;;
  /* TEMP -= 16 * SIZE.  NOTE(review): unlike the four-vector loop at
     .L12, TEMP is adjusted here before it is (re)loaded with 4 * LDA at
     the bottom of the body, so on the first pass it still holds whatever
     was left in it.  That value is not consumed: p7 is false on the first
     pass because COUNT is 0 on entry, and TEMP only ever feeds the
     prefetch pointer PREA. */
  569. { .mmi
  570. (p18) ST [B ] = f37, SIZE
  571. (p18) ST [B1] = f49, SIZE
  572. adds TEMP = -16 * SIZE,TEMP
  573. }
  574. { .mmb
  575. (p16) LD f56 = [A2], SIZE
  576. (p16) LD f59 = [A6], SIZE
  577. nop __LINE__
  578. }
  579. ;;
  /* On odd passes pull PREA back by TEMP. */
  580. { .mmi
  581. (p18) ST [B ] = f43, SIZE
  582. (p18) ST [B1] = f55, SIZE
  583. (p7) sub PREA = PREA,TEMP
  584. }
  585. { .mmb
  586. (p16) LD f62 = [A2], SIZE
  587. (p16) LD f65 = [A6], SIZE
  588. nop __LINE__
  589. }
  590. ;;
  /* COUNT is bumped only while p16 is set. */
  591. { .mmi
  592. (p18) ST [B ] = f61, SIZE
  593. (p18) ST [B1] = f73, SIZE
  594. (p16) adds COUNT = 1,COUNT
  595. }
  596. { .mmb
  597. (p16) LD f68 = [A2], SIZE
  598. (p16) LD f71 = [A6], SIZE
  599. nop __LINE__
  600. }
  601. ;;
  /* TEMP = 4 * LDA, consumed by the next pass. */
  602. { .mmi
  603. (p18) ST [B ] = f67, 5 * SIZE
  604. (p18) ST [B1] = f79, 5 * SIZE
  605. shladd TEMP = LDA,2,r0
  606. }
  607. { .mmb
  608. (p16) LD f74 = [A2], 5 * SIZE
  609. (p16) LD f77 = [A6], 5 * SIZE
  610. br.ctop.sptk.few .L21
  611. }
  612. ;;
  /* .L25: remainder of the two-vector group.  p10 (bit 1 of M set) copies
     four values from each of A1 and A2; p11 (bit 0 of M set) copies two
     more from each.  Execution then falls through to .L30. */
  613. .align 32
  614. .L25:
  /* p10 loads: f32..f35 via A1, f40..f43 via A2. */
  615. { .mmb
  616. (p10) LD f32 = [A1], SIZE
  617. (p10) LD f40 = [A2], SIZE
  618. nop __LINE__
  619. }
  620. ;;
  621. { .mmb
  622. (p10) LD f33 = [A1], SIZE
  623. (p10) LD f41 = [A2], SIZE
  624. nop __LINE__
  625. }
  626. ;;
  627. { .mmb
  628. (p10) LD f34 = [A1], SIZE
  629. (p10) LD f42 = [A2], SIZE
  630. nop __LINE__
  631. }
  632. ;;
  633. { .mmb
  634. (p10) LD f35 = [A1], SIZE
  635. (p10) LD f43 = [A2], SIZE
  636. nop __LINE__
  637. }
  638. ;;
  /* p11 loads: f36/f37 via A1, f44/f45 via A2 (second load of each pair
     has no post-increment). */
  639. { .mmb
  640. (p11) LD f36 = [A1], SIZE
  641. (p11) LD f44 = [A2], SIZE
  642. nop __LINE__
  643. }
  644. ;;
  645. { .mmb
  646. (p11) LD f37 = [A1]
  647. (p11) LD f45 = [A2]
  648. nop __LINE__
  649. }
  650. ;;
  /* p10 stores: B gets the first two values of each vector (f32, f33, f40,
     f41), B1 (4 * SIZE ahead) the next two of each (f34, f35, f42, f43).
     The last store of each run post-increments by 5 * SIZE, leaving B just
     past the eight values written. */
  651. { .mmb
  652. (p10) ST [B ] = f32, SIZE
  653. (p10) ST [B1] = f34, SIZE
  654. nop __LINE__
  655. }
  656. ;;
  657. { .mmb
  658. (p10) ST [B ] = f33, SIZE
  659. (p10) ST [B1] = f35, SIZE
  660. nop __LINE__
  661. }
  662. ;;
  663. { .mmb
  664. (p10) ST [B ] = f40, SIZE
  665. (p10) ST [B1] = f42, SIZE
  666. nop __LINE__
  667. }
  668. ;;
  669. { .mmb
  670. (p10) ST [B ] = f41, 5 * SIZE
  671. (p10) ST [B1] = f43, 5 * SIZE
  672. nop __LINE__
  673. }
  674. ;;
  /* p11 stores go through B only, one after another (note the stop inside
     each bundle): f36, f37, f44, f45. */
  675. { .mmi
  676. (p11) ST [B ] = f36, SIZE
  677. ;;
  678. (p11) ST [B ] = f37, SIZE
  679. nop __LINE__
  680. }
  681. ;;
  682. { .mmi
  683. (p11) ST [B ] = f44, SIZE
  684. ;;
  685. (p11) ST [B ] = f45, SIZE
  686. nop __LINE__
  687. }
  688. ;;
  /* .L30: set-up for a single remaining source vector.  Runs only when
     bit 0 of N is set; otherwise control goes to the exit at .L999. */
  689. .align 32
  690. .L30:
  /* A1 = A; COUNT = 0; clear the rotating predicates. */
  691. { .mmi
  692. mov A1 = A
  693. mov COUNT = r0
  694. mov pr.rot = 0
  695. }
  /* A5 = A + 4 * SIZE; B1 = B + 4 * SIZE; p8 = bit 0 of N is clear. */
  696. { .mmi
  697. adds A5 = 4 * SIZE,A
  698. adds B1 = 4 * SIZE,B
  699. tbit.z p8,p0 = N,0
  700. }
  701. ;;
  /* p16 = 1 to start the first pipeline stage; epilog count ar.ec = 3. */
  702. { .mmi
  703. cmp.eq p16,p0 = r0,r0
  704. nop __LINE__
  705. mov ar.ec = 3
  706. }
  /* I = M >> 2; leave through .L999 when p8 is set. */
  707. { .mib
  708. nop __LINE__
  709. shr I = M,2
  710. (p8) br.cond.dptk.few .L999
  711. }
  712. ;;
  /* p6 = (I == 0) on the undecremented I; PREA = A + PREFETCHSIZE * SIZE;
     then I is decremented for ar.lc. */
  713. { .mmi
  714. cmp.eq p6 ,p0 = 0, I
  715. adds PREA = PREFETCHSIZE * SIZE, A
  716. adds I = -1, I
  717. }
  718. ;;
  /* PREB = B + WPREFETCHSIZE * SIZE; ar.lc = I; go to the remainder code at
     .L35 when there is no full loop pass (p6). */
  719. { .mib
  720. adds PREB = WPREFETCHSIZE * SIZE, B
  721. mov ar.lc = I
  722. (p6) br.cond.dpnt.few .L35
  723. }
  724. ;;
  /* .L31: counted loop closed by br.ctop, copying from one source vector.
     Loads are predicated on p16 and stores on p18; each store names the FP
     register two above the one written by the matching load.
     Per pass, A1 and A5 (= A1 + 4 * SIZE) each read four consecutive
     SIZE-sized values, and B and B1 (= B + 4 * SIZE) each write four, so
     eight contiguous source values land in eight contiguous destination
     slots. */
  725. .align 32
  726. .L31:
  /* Prefetch pair (PREA += LDA, PREB += 16 * SIZE); p7 = low bit of COUNT
     is set (odd pass). */
  727. { .mmi
  728. (p16) lfetch.nt1 [PREA], LDA
  729. (p16) lfetch.excl.nt1 [PREB ], 16 * SIZE
  730. tbit.z p0, p7 = COUNT, 0
  731. }
  732. { .mmb
  733. nop __LINE__
  734. nop __LINE__
  735. nop __LINE__
  736. }
  737. ;;
  /* TEMP = 4 * LDA. */
  738. { .mmi
  739. (p18) ST [B ] = f34, SIZE
  740. (p18) ST [B1] = f37, SIZE
  741. shladd TEMP = LDA,2,r0
  742. }
  743. { .mmb
  744. (p16) LD f32 = [A1], SIZE
  745. (p16) LD f35 = [A5], SIZE
  746. nop __LINE__
  747. }
  748. ;;
  /* TEMP = 4 * LDA - 16 * SIZE. */
  749. { .mmi
  750. (p18) ST [B ] = f40, SIZE
  751. (p18) ST [B1] = f43, SIZE
  752. adds TEMP = -16 * SIZE,TEMP
  753. }
  754. { .mmb
  755. (p16) LD f38 = [A1], SIZE
  756. (p16) LD f41 = [A5], SIZE
  757. nop __LINE__
  758. }
  759. ;;
  760. { .mmb
  761. (p18) ST [B ] = f46, SIZE
  762. (p18) ST [B1] = f49, SIZE
  763. nop __LINE__
  764. }
  /* On odd passes pull PREA back by TEMP. */
  765. { .mmi
  766. (p16) LD f44 = [A1], SIZE
  767. (p16) LD f47 = [A5], SIZE
  768. (p7) sub PREA = PREA,TEMP
  769. }
  770. ;;
  /* Last store of each run post-increments by 5 * SIZE to hop over the
     other pointer's run; COUNT is bumped only while p16 is set. */
  771. { .mmi
  772. (p18) ST [B ] = f52, 5 * SIZE
  773. (p18) ST [B1] = f55, 5 * SIZE
  774. (p16) adds COUNT = 1,COUNT
  775. }
  776. { .mmb
  777. (p16) LD f50 = [A1], 5 * SIZE
  778. (p16) LD f53 = [A5], 5 * SIZE
  779. br.ctop.sptk.few .L31
  780. }
  781. ;;
  /* .L35: remainder of the single-vector case.  p10 (bit 1 of M set) copies
     four values and p11 (bit 0 of M set) two more, all read through A1 and
     written through B in order.  Each bundle carries an internal stop
     because consecutive accesses post-increment the same pointer.
     Execution then falls through to .L999. */
  782. .align 32
  783. .L35:
  /* p10 loads: f32..f35. */
  784. { .mmi
  785. (p10) LD f32 = [A1], SIZE
  786. ;;
  787. (p10) LD f33 = [A1], SIZE
  788. nop __LINE__
  789. }
  790. ;;
  791. { .mmi
  792. (p10) LD f34 = [A1], SIZE
  793. ;;
  794. (p10) LD f35 = [A1], SIZE
  795. nop __LINE__
  796. }
  797. ;;
  /* p11 loads: f36, f37 (the last load has no post-increment). */
  798. { .mmi
  799. (p11) LD f36 = [A1], SIZE
  800. ;;
  801. (p11) LD f37 = [A1]
  802. nop __LINE__
  803. }
  804. ;;
  /* p10 stores: f32..f35. */
  805. { .mmi
  806. (p10) ST [B ] = f32, SIZE
  807. ;;
  808. (p10) ST [B ] = f33, SIZE
  809. nop __LINE__
  810. }
  811. ;;
  812. { .mmi
  813. (p10) ST [B ] = f34, SIZE
  814. ;;
  815. (p10) ST [B ] = f35, SIZE
  816. nop __LINE__
  817. }
  818. ;;
  /* p11 stores: f36, f37. */
  819. { .mmi
  820. (p11) ST [B ] = f36, SIZE
  821. ;;
  822. (p11) ST [B ] = f37, SIZE
  823. nop __LINE__
  824. }
  825. ;;
  /* .L999: common exit.  Writes PR back to the predicate registers (mask -1
     selects all of them), writes ARLC back to ar.lc - both copies were
     taken at function entry - and returns through b0. */
  826. .align 32
  827. .L999:
  828. mov pr = PR,-1
  829. mov ar.lc = ARLC
  830. br.ret.sptk.many b0
  831. ;;
  832. EPILOGUE