You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_tcopy.S 15 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build switch and shared definitions. common.h is not shown here; it is */
  /* presumably where SIZE, BASE_SHIFT, ZBASE_SHIFT, LDF8, STF8_NTA, */
  /* PROLOGUE, PROFCODE and EPILOGUE come from - TODO confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distances, in units of SIZE, added to the source pointer */
  /* (PREFETCHSIZE -> PREA) and the destination pointer (WPREFETCHSIZE -> PREB). */
  40. #define PREFETCHSIZE 24
  41. #define WPREFETCHSIZE 48
  /* Load / store mnemonics used for every element moved by this routine. */
  42. #define LD LDF8
  43. #define ST STF8_NTA
  /* Prefetch pointers: PREA walks the source, PREB walks the destination. */
  44. #define PREA r2
  45. #define PREB r3
  /* I = inner trip count (N >> 2, minus one when loaded into ar.lc). */
  /* J = number of 4-line groups still to process (starts at M >> 2). */
  46. #define I r14
  47. #define J r15
  /* A1..A4 = pointers to successive source lines, LDA apart. */
  /* A5..A8 = A1..A4 + 4 * SIZE (second load stream on the same line). */
  48. #define A1 r16
  49. #define A2 r17
  50. #define A3 r18
  51. #define A4 r19
  52. #define A5 r20
  53. #define A6 r21
  54. #define A7 r22
  55. #define A8 r23
  /* B1 / B2 = destination store pointers; B2 is kept 4 * SIZE past B1. */
  56. #define B1 r24
  57. #define B2 r25
  /* COUNT = iteration counter whose bit 0 gates the PREA rewind. */
  /* TEMP = the rewind amount subtracted from PREA. */
  58. #define COUNT r26
  59. #define TEMP r27
  /* BO2 / BO3 = destination pointers for the (N & 2) and (N & 1) tails. */
  60. #define BO2 r28
  61. #define BO3 r29
  /* LDB = byte step between output blocks of successive inner iterations. */
  /* It aliases r8, which the entry code also uses as a scratch register. */
  62. #define LDB r8
  /* Saved copies of ar.lc and the predicate registers, restored at .L999. */
  63. #define ARLC r30
  64. #define PR r31
  /* r32..r36 are read without being written first in this routine, i.e. they */
  /* carry the incoming arguments M, N, A, LDA and B. */
  65. #define M r32
  66. #define N r33
  67. #define A r34
  68. #define LDA r35
  69. #define B r36
  /* Routine entry. Saves ar.lc and the predicates, scales the strides and */
  /* derives the tail destination pointers: */
  /*   LDA <<= ZBASE_SHIFT */
  /*   LDB  = M << (BASE_SHIFT + 3) */
  /*   BO2  = B + ((M * (N & -4)) << ZBASE_SHIFT) */
  /*   BO3  = B + ((M * (N & -2)) << ZBASE_SHIFT) */
  /*   J    = M >> 2 */
  /*   p10  = bit 1 of N is set, p11 = bit 0 of N is set */
  /* Branches straight to .L20 when J == 0. */
  70. PROLOGUE
  71. .prologue
  72. PROFCODE
  73. .body
  /* r8 is only a scratch here (N & -4); it becomes LDB two groups later. */
  74. { .mmi
  75. setf.sig f32 = M
  76. and r8 = -4, N
  77. mov ARLC = ar.lc
  78. }
  79. ;;
  80. { .mmi
  81. setf.sig f33 = r8
  82. and r9 = -2, N
  83. mov PR = pr
  84. }
  85. ;;
  86. { .mmi
  87. setf.sig f34 = r9
  88. shladd LDA = LDA, ZBASE_SHIFT, r0
  89. shl LDB = M, BASE_SHIFT + 3
  90. }
  91. ;;
  /* Integer products formed in FP registers: f33 = M * (N & -4), */
  /* f34 = M * (N & -2); both are moved back to BO2 / BO3 below. */
  92. { .mfi
  93. nop __LINE__
  94. xmpy.l f33 = f32, f33
  95. shr J = M, 2
  96. }
  97. { .mfi
  98. nop __LINE__
  99. xmpy.l f34 = f32, f34
  100. nop __LINE__
  101. }
  102. ;;
  103. { .mmb
  104. getf.sig BO2 = f33
  105. getf.sig BO3 = f34
  106. nop __LINE__
  107. }
  108. ;;
  /* Turn the products into addresses relative to B and record which */
  /* tail cases (N & 2, N & 1) exist. */
  109. { .mmi
  110. shladd BO2 = BO2, ZBASE_SHIFT, B
  111. shladd BO3 = BO3, ZBASE_SHIFT, B
  112. tbit.nz p10, p0 =N, 1
  113. }
  /* No complete group of four source lines: skip the .L11 loop nest. */
  114. { .mib
  115. cmp.eq p6, p0 = 0, J
  116. tbit.nz p11, p0 =N, 0
  117. (p6) br.cond.dpnt .L20
  118. }
  119. ;;
  /* .L11: set up one group of four source lines. */
  /*   A1..A4 = A, A + LDA, A + 2 * LDA, A + 3 * LDA;  A5..A8 = A1..A4 + 4 * SIZE */
  /*   B1 = B, B2 = B + 4 * SIZE, then B += 32 * SIZE for the next group */
  /*   A += 4 * LDA, J -= 1 */
  /*   I = N >> 2; ar.lc = I - 1; the loop is skipped (to .L15) when I == 0 */
  /* The rotating predicates are cleared and p16 set, with ar.ec = 3, to */
  /* prime the pipelined loop at .L12. */
  120. .align 32
  121. .L11:
  122. { .mmi
  123. mov A1 = A
  124. add A2 = A, LDA
  125. mov pr.rot = 0
  126. }
  127. { .mmi
  128. shladd A3 = LDA, 1, A
  129. mov B1 = B
  130. shr I = N, 2
  131. }
  132. ;;
  133. { .mmi
  134. shladd A4 = LDA, 1, A2
  135. cmp.eq p16,p0 = r0, r0
  136. mov ar.ec = 3
  137. }
  /* p6 is computed from I before I is decremented in the same group. */
  138. { .mmi
  139. cmp.eq p6,p0 = 0,I
  140. adds I =-1, I
  141. adds J =-1, J
  142. }
  143. ;;
  144. { .mmi
  145. shladd A = LDA, 2, A
  146. adds A5 = 4 * SIZE, A1
  147. adds A6 = 4 * SIZE, A2
  148. }
  149. { .mmi
  150. adds A7 = 4 * SIZE, A3
  151. adds A8 = 4 * SIZE, A4
  152. adds PREA = PREFETCHSIZE * SIZE,A1
  153. }
  154. ;;
  /* B2 and PREB are derived from B in the same group that advances B. */
  155. { .mmb
  156. adds B2 = 4 * SIZE, B
  157. adds PREB = WPREFETCHSIZE * SIZE, B
  158. nop __LINE__
  159. }
  160. { .mib
  161. adds B = 32 * SIZE, B
  162. mov ar.lc = I
  163. (p6) br.cond.dpnt.few .L15
  164. }
  165. ;;
  /* .L12: software-pipelined copy loop for four source lines (br.ctop, */
  /* ar.lc = I - 1, ar.ec = 3). Loads run under p16 and stores under p18; */
  /* a value loaded into fN is stored from fN+2 two rotations later. */
  /* Per iteration every pointer A1..A8 loads four SIZE-values and advances */
  /* by 8 * SIZE in total (3 * SIZE + 5 * SIZE). */
  /* B1 stores the A1..A4 streams and B2 (4 * SIZE ahead) the A5..A8 streams. */
  /* The 5 * SIZE post-increment after each fourth store steps a pointer over */
  /* the other pointer's four slots, so 32 consecutive SIZE-slots are filled. */
  /* The last store rewinds by 27 * SIZE and LDB is then added to B1 / B2. */
  /* Two lfetch pairs per iteration advance PREA by LDA and PREB by LDB. */
  /* NOTE(review): COUNT is read here before any visible write on the first */
  /* pass (it is zeroed only in .L15 / .L25). In this loop it only steers the */
  /* PREA rewind, so it appears to affect prefetch addresses only - confirm. */
  166. .L12:
  167. { .mmb
  168. (p16) lfetch.nt1 [PREA], LDA
  169. (p16) lfetch.excl.nt1 [PREB], LDB
  170. nop __LINE__
  171. }
  172. { .mmb
  173. nop __LINE__
  174. nop __LINE__
  175. nop __LINE__
  176. }
  177. ;;
  /* Line 1: A1 -> B1, A5 -> B2. */
  178. { .mmb
  179. (p18) ST [B1] = f34, SIZE
  180. (p18) ST [B2] = f37, SIZE
  181. nop __LINE__
  182. }
  183. { .mmb
  184. (p16) LD f32 = [A1], SIZE
  185. (p16) LD f35 = [A5], SIZE
  186. nop __LINE__
  187. }
  188. ;;
  189. { .mmb
  190. (p18) ST [B1] = f40, SIZE
  191. (p18) ST [B2] = f43, SIZE
  192. nop __LINE__
  193. }
  194. { .mmb
  195. (p16) LD f38 = [A1], SIZE
  196. (p16) LD f41 = [A5], SIZE
  197. nop __LINE__
  198. }
  199. ;;
  200. { .mmb
  201. (p18) ST [B1] = f46, SIZE
  202. (p18) ST [B2] = f49, SIZE
  203. nop __LINE__
  204. }
  205. { .mmb
  206. (p16) LD f44 = [A1], SIZE
  207. (p16) LD f47 = [A5], SIZE
  208. nop __LINE__
  209. }
  210. ;;
  /* p7 = bit 0 of COUNT; it gates the PREA rewind further down. */
  211. { .mmi
  212. (p18) ST [B1] = f52, 5 * SIZE
  213. (p18) ST [B2] = f55, 5 * SIZE
  214. tbit.z p0,p7 = COUNT,0
  215. }
  216. { .mmb
  217. (p16) LD f50 = [A1], 5 * SIZE
  218. (p16) LD f53 = [A5], 5 * SIZE
  219. nop __LINE__
  220. }
  221. ;;
  /* Line 2: A2 -> B1, A6 -> B2. */
  222. { .mmb
  223. (p18) ST [B1] = f58, SIZE
  224. (p18) ST [B2] = f61, SIZE
  225. nop __LINE__
  226. }
  227. { .mmb
  228. (p16) LD f56 = [A2], SIZE
  229. (p16) LD f59 = [A6], SIZE
  230. nop __LINE__
  231. }
  232. ;;
  233. { .mmb
  234. (p18) ST [B1] = f64, SIZE
  235. (p18) ST [B2] = f67, SIZE
  236. nop __LINE__
  237. }
  238. { .mmb
  239. (p16) LD f62 = [A2], SIZE
  240. (p16) LD f65 = [A6], SIZE
  241. nop __LINE__
  242. }
  243. ;;
  244. { .mmb
  245. (p18) ST [B1] = f70, SIZE
  246. (p18) ST [B2] = f73, SIZE
  247. nop __LINE__
  248. }
  249. { .mmb
  250. (p16) LD f68 = [A2], SIZE
  251. (p16) LD f71 = [A6], SIZE
  252. nop __LINE__
  253. }
  254. ;;
  /* TEMP = 4 * LDA here; 16 * SIZE is subtracted from it two groups later. */
  255. { .mmi
  256. (p18) ST [B1] = f76, 5 * SIZE
  257. (p18) ST [B2] = f79, 5 * SIZE
  258. shladd TEMP = LDA, 2, r0
  259. }
  260. { .mmb
  261. (p16) LD f74 = [A2], 5 * SIZE
  262. (p16) LD f77 = [A6], 5 * SIZE
  263. nop __LINE__
  264. }
  265. ;;
  /* Line 3: A3 -> B1, A7 -> B2. The second lfetch pair is issued here. */
  266. { .mmb
  267. (p18) ST [B1] = f82, SIZE
  268. (p18) ST [B2] = f85, SIZE
  269. nop __LINE__
  270. }
  271. { .mmb
  272. (p16) lfetch.nt1 [PREA], LDA
  273. (p16) lfetch.excl.nt1 [PREB], LDB
  274. nop __LINE__
  275. }
  276. ;;
  277. { .mmi
  278. (p18) ST [B1] = f88, SIZE
  279. (p18) ST [B2] = f91, SIZE
  280. adds TEMP = -16 * SIZE, TEMP
  281. }
  282. { .mmb
  283. (p16) LD f80 = [A3], SIZE
  284. (p16) LD f83 = [A7], SIZE
  285. nop __LINE__
  286. }
  287. ;;
  /* When bit 0 of COUNT is set, pull PREA back by 4 * LDA - 16 * SIZE. */
  288. { .mmi
  289. (p18) ST [B1] = f94, SIZE
  290. (p18) ST [B2] = f97, SIZE
  291. (p7) sub PREA = PREA, TEMP
  292. }
  293. { .mmb
  294. (p16) LD f86 = [A3], SIZE
  295. (p16) LD f89 = [A7], SIZE
  296. nop __LINE__
  297. }
  298. ;;
  299. { .mmb
  300. (p18) ST [B1] = f100, 5 * SIZE
  301. (p18) ST [B2] = f103, 5 * SIZE
  302. nop __LINE__
  303. }
  304. { .mmb
  305. (p16) LD f92 = [A3], SIZE
  306. (p16) LD f95 = [A7], SIZE
  307. nop __LINE__
  308. }
  309. ;;
  /* Line 4: A4 -> B1, A8 -> B2. */
  310. { .mmb
  311. (p18) ST [B1] = f106, SIZE
  312. (p18) ST [B2] = f109, SIZE
  313. nop __LINE__
  314. }
  315. { .mmb
  316. (p16) LD f98 = [A3], 5 * SIZE
  317. (p16) LD f101 = [A7], 5 * SIZE
  318. nop __LINE__
  319. }
  320. ;;
  321. { .mmb
  322. (p18) ST [B1] = f112, SIZE
  323. (p18) ST [B2] = f115, SIZE
  324. nop __LINE__
  325. }
  326. { .mmb
  327. (p16) LD f104 = [A4], SIZE
  328. (p16) LD f107 = [A8], SIZE
  329. nop __LINE__
  330. }
  331. ;;
  332. { .mmb
  333. (p18) ST [B1] = f118, SIZE
  334. (p18) ST [B2] = f121, SIZE
  335. nop __LINE__
  336. }
  337. { .mmb
  338. (p16) LD f110 = [A4], SIZE
  339. (p16) LD f113 = [A8], SIZE
  340. nop __LINE__
  341. }
  342. ;;
  /* Final stores rewind B1 / B2 to the block start (3+5+3+5+3+5+3 = 27). */
  343. { .mmi
  344. (p18) ST [B1] = f124, -27 * SIZE
  345. (p18) ST [B2] = f127, -27 * SIZE
  346. (p16) adds COUNT = 1, COUNT
  347. }
  348. { .mmb
  349. (p16) LD f116 = [A4], SIZE
  350. (p16) LD f119 = [A8], SIZE
  351. nop __LINE__
  352. }
  353. ;;
  /* Step both store pointers by LDB for the next inner iteration. */
  354. { .mmb
  355. (p18) add B1 = B1, LDB
  356. (p18) add B2 = B2, LDB
  357. nop __LINE__
  358. }
  359. { .mmb
  360. (p16) LD f122 = [A4], 5 * SIZE
  361. (p16) LD f125 = [A8], 5 * SIZE
  362. br.ctop.sptk.few .L12
  363. }
  364. ;;
  /* .L15: tails of the current four-line group, then loop control. */
  /* p10 (bit 1 of N): four more SIZE-values from each of A1..A4 are written */
  /*   as 16 consecutive slots at BO2, in line order A1, A2, A3, A4; BO2 */
  /*   ends up advanced by 16 * SIZE. */
  /* p11 (bit 0 of N): two more SIZE-values from each of A1..A4 are written */
  /*   as 8 consecutive slots at BO3, in line order A1, A2, A3, A4; BO3 ends */
  /*   up advanced by 8 * SIZE. */
  /* B2 is reused as the second store pointer (BO2 + 4 * SIZE, then */
  /* BO3 + 4 * SIZE). COUNT is reset to 0 and control returns to .L11 while */
  /* J != 0. */
  365. .align 32
  366. .L15:
  /* p10 loads: f32..f35 <- A1, f40..f43 <- A2, f50..f53 <- A3, f60..f63 <- A4. */
  367. { .mmb
  368. (p10) LD f32 = [A1], SIZE
  369. (p10) LD f40 = [A2], SIZE
  370. nop __LINE__
  371. }
  372. ;;
  373. { .mmb
  374. (p10) LD f33 = [A1], SIZE
  375. (p10) LD f41 = [A2], SIZE
  376. nop __LINE__
  377. }
  378. ;;
  379. { .mmb
  380. (p10) LD f34 = [A1], SIZE
  381. (p10) LD f42 = [A2], SIZE
  382. nop __LINE__
  383. }
  384. ;;
  385. { .mmb
  386. (p10) LD f35 = [A1], SIZE
  387. (p10) LD f43 = [A2], SIZE
  388. nop __LINE__
  389. }
  390. ;;
  391. { .mmb
  392. (p10) LD f50 = [A3], SIZE
  393. (p10) LD f60 = [A4], SIZE
  394. nop __LINE__
  395. }
  396. ;;
  397. { .mmb
  398. (p10) LD f51 = [A3], SIZE
  399. (p10) LD f61 = [A4], SIZE
  400. nop __LINE__
  401. }
  402. ;;
  403. { .mmb
  404. (p10) LD f52 = [A3], SIZE
  405. (p10) LD f62 = [A4], SIZE
  406. nop __LINE__
  407. }
  408. ;;
  409. { .mmb
  410. (p10) LD f53 = [A3], SIZE
  411. (p10) LD f63 = [A4], SIZE
  412. nop __LINE__
  413. }
  414. ;;
  /* p11 loads: f36,f37 <- A1, f44,f45 <- A2, f54,f55 <- A3, f64,f65 <- A4. */
  415. { .mmb
  416. (p11) LD f36 = [A1], SIZE
  417. (p11) LD f44 = [A2], SIZE
  418. nop __LINE__
  419. }
  420. ;;
  421. { .mmb
  422. (p11) LD f37 = [A1]
  423. (p11) LD f45 = [A2]
  424. nop __LINE__
  425. }
  426. ;;
  427. { .mmb
  428. (p11) LD f54 = [A3], SIZE
  429. (p11) LD f64 = [A4], SIZE
  430. nop __LINE__
  431. }
  432. ;;
  433. { .mmi
  434. (p11) LD f55 = [A3]
  435. (p11) LD f65 = [A4]
  436. adds B2 = 4 * SIZE, BO2
  437. }
  438. ;;
  /* p10 stores: BO2 takes the A1 then A3 values, B2 the A2 then A4 values. */
  439. { .mmb
  440. (p10) ST [BO2] = f32, SIZE
  441. (p10) ST [B2] = f40, SIZE
  442. nop __LINE__
  443. }
  444. ;;
  445. { .mmb
  446. (p10) ST [BO2] = f33, SIZE
  447. (p10) ST [B2] = f41, SIZE
  448. nop __LINE__
  449. }
  450. ;;
  451. { .mmb
  452. (p10) ST [BO2] = f34, SIZE
  453. (p10) ST [B2] = f42, SIZE
  454. nop __LINE__
  455. }
  456. ;;
  457. { .mmb
  458. (p10) ST [BO2] = f35, 5 * SIZE
  459. (p10) ST [B2] = f43, 5 * SIZE
  460. nop __LINE__
  461. }
  462. ;;
  463. { .mmb
  464. (p10) ST [BO2] = f50, SIZE
  465. (p10) ST [B2] = f60, SIZE
  466. nop __LINE__
  467. }
  468. ;;
  469. { .mmb
  470. (p10) ST [BO2] = f51, SIZE
  471. (p10) ST [B2] = f61, SIZE
  472. nop __LINE__
  473. }
  474. ;;
  475. { .mmb
  476. (p10) ST [BO2] = f52, SIZE
  477. (p10) ST [B2] = f62, SIZE
  478. nop __LINE__
  479. }
  480. ;;
  /* Last p10 store through B2 has no post-increment; B2 is re-aimed at */
  /* BO3 + 4 * SIZE in the same bundle for the p11 stores. */
  481. { .mmi
  482. (p10) ST [BO2] = f53, 5 * SIZE
  483. (p10) ST [B2] = f63
  484. adds B2 = 4 * SIZE, BO3
  485. }
  486. ;;
  /* p11 stores: BO3 takes the A1 then A2 values, B2 the A3 then A4 values. */
  487. { .mmb
  488. (p11) ST [BO3] = f36, SIZE
  489. (p11) ST [B2] = f54, SIZE
  490. nop __LINE__
  491. }
  492. ;;
  493. { .mmi
  494. (p11) ST [BO3] = f37, SIZE
  495. (p11) ST [B2] = f55, SIZE
  496. mov COUNT = r0
  497. }
  498. ;;
  /* p6 = (J != 0): another four-line group remains. */
  499. { .mmi
  500. (p11) ST [BO3] = f44, SIZE
  501. (p11) ST [B2] = f64, SIZE
  502. cmp.eq p0,p6 = 0,J
  503. }
  504. ;;
  505. { .mmb
  506. (p11) ST [BO3] = f45, 5 * SIZE
  507. (p11) ST [B2] = f65, 5 * SIZE
  508. (p6) br.cond.dptk.few .L11
  509. }
  510. ;;
  /* .L20: set up a group of two source lines; taken only when bit 1 of M */
  /* is set, otherwise control goes to .L30. */
  /*   A1 = A, A2 = A + LDA, A5 / A6 = A1 / A2 + 4 * SIZE, A += 2 * LDA */
  /*   B1 = B, B2 = B + 4 * SIZE, then B += 16 * SIZE */
  /*   I = N >> 2; ar.lc = I - 1; the loop is skipped (to .L25) when I == 0 */
  511. .align 32
  512. .L20:
  513. { .mmi
  514. mov A1 = A
  515. add A2 = A, LDA
  516. mov pr.rot = 0
  517. }
  518. { .mmi
  519. mov B1 = B
  520. adds PREA = PREFETCHSIZE * SIZE,A
  521. tbit.z p6, p0 = M, 1
  522. }
  523. ;;
  524. { .mmi
  525. cmp.eq p16,p0 = r0, r0
  526. adds B2 = 4 * SIZE, B
  527. mov ar.ec = 3
  528. }
  /* p6 here means bit 1 of M is clear: no two-line group to copy. */
  529. { .mib
  530. adds PREB = WPREFETCHSIZE * SIZE, B
  531. shr I = N, 2
  532. (p6) br.cond.dpnt .L30
  533. }
  534. ;;
  /* p6 is reused: now true when the N >> 2 loop has zero iterations. */
  535. { .mmi
  536. cmp.eq p6, p0 = 0, I
  537. adds I =-1, I
  538. nop __LINE__
  539. }
  540. { .mmi
  541. shladd A = LDA, 1, A
  542. adds A5 = 4 * SIZE, A1
  543. adds A6 = 4 * SIZE, A2
  544. }
  545. ;;
  546. { .mmb
  547. nop __LINE__
  548. nop __LINE__
  549. nop __LINE__
  550. }
  551. { .mib
  552. adds B = 16 * SIZE, B
  553. mov ar.lc = I
  554. (p6) br.cond.dpnt.few .L25
  555. }
  556. ;;
  /* .L22: software-pipelined copy loop for two source lines (br.ctop, */
  /* loads under p16, stores under p18, fN loaded -> fN+2 stored). */
  /* Each iteration fills 16 consecutive SIZE-slots: A1 / A5 then A2 / A6, */
  /* with B1 taking the A1 / A2 streams and B2 (4 * SIZE ahead) the A5 / A6 */
  /* streams. The last store rewinds by 11 * SIZE (3 + 5 + 3) and LDB is */
  /* then added to B1 / B2. */
  /* One lfetch pair per iteration; when bit 0 of COUNT is set, PREA is */
  /* pulled back by TEMP = 2 * LDA - 16 * SIZE. */
  /* NOTE(review): if .L15 was never executed, COUNT has no visible */
  /* initialisation before this loop; it only steers PREA here - confirm. */
  557. .L22:
  558. { .mmi
  559. (p16) lfetch.nt1 [PREA], LDA
  560. (p16) lfetch.excl.nt1 [PREB], LDB
  561. shladd TEMP = LDA, 1, r0
  562. }
  563. ;;
  /* Line 1: A1 -> B1, A5 -> B2. */
  564. { .mmb
  565. (p18) ST [B1] = f34, SIZE
  566. (p18) ST [B2] = f37, SIZE
  567. nop __LINE__
  568. }
  569. { .mmb
  570. (p16) LD f32 = [A1], SIZE
  571. (p16) LD f35 = [A5], SIZE
  572. nop __LINE__
  573. }
  574. ;;
  575. { .mmb
  576. (p18) ST [B1] = f40, SIZE
  577. (p18) ST [B2] = f43, SIZE
  578. nop __LINE__
  579. }
  580. { .mmb
  581. (p16) LD f38 = [A1], SIZE
  582. (p16) LD f41 = [A5], SIZE
  583. nop __LINE__
  584. }
  585. ;;
  586. { .mmb
  587. (p18) ST [B1] = f46, SIZE
  588. (p18) ST [B2] = f49, SIZE
  589. nop __LINE__
  590. }
  591. { .mmb
  592. (p16) LD f44 = [A1], SIZE
  593. (p16) LD f47 = [A5], SIZE
  594. nop __LINE__
  595. }
  596. ;;
  /* p7 = bit 0 of COUNT; it gates the PREA rewind below. */
  597. { .mmi
  598. (p18) ST [B1] = f52, 5 * SIZE
  599. (p18) ST [B2] = f55, 5 * SIZE
  600. tbit.z p0,p7 = COUNT,0
  601. }
  602. { .mmb
  603. (p16) LD f50 = [A1], 5 * SIZE
  604. (p16) LD f53 = [A5], 5 * SIZE
  605. nop __LINE__
  606. }
  607. ;;
  /* Line 2: A2 -> B1, A6 -> B2. */
  608. { .mmb
  609. (p18) ST [B1] = f58, SIZE
  610. (p18) ST [B2] = f61, SIZE
  611. nop __LINE__
  612. }
  613. { .mmb
  614. (p16) LD f56 = [A2], SIZE
  615. (p16) LD f59 = [A6], SIZE
  616. nop __LINE__
  617. }
  618. ;;
  619. { .mmi
  620. (p18) ST [B1] = f64, SIZE
  621. (p18) ST [B2] = f67, SIZE
  622. adds TEMP = -16 * SIZE, TEMP
  623. }
  624. { .mmb
  625. (p16) LD f62 = [A2], SIZE
  626. (p16) LD f65 = [A6], SIZE
  627. nop __LINE__
  628. }
  629. ;;
  630. { .mmi
  631. (p18) ST [B1] = f70, SIZE
  632. (p18) ST [B2] = f73, SIZE
  633. (p7) sub PREA = PREA, TEMP
  634. }
  635. { .mmb
  636. (p16) LD f68 = [A2], SIZE
  637. (p16) LD f71 = [A6], SIZE
  638. nop __LINE__
  639. }
  640. ;;
  /* Final stores rewind B1 / B2 to the block start. */
  641. { .mmi
  642. (p18) ST [B1] = f76, -11 * SIZE
  643. (p18) ST [B2] = f79, -11 * SIZE
  644. (p16) adds COUNT = 1, COUNT
  645. }
  646. { .mmb
  647. (p16) LD f74 = [A2], 5 * SIZE
  648. (p16) LD f77 = [A6], 5 * SIZE
  649. nop __LINE__
  650. }
  651. ;;
  /* Step both store pointers by LDB for the next inner iteration. */
  652. { .mmb
  653. (p18) add B1 = B1, LDB
  654. (p18) add B2 = B2, LDB
  655. br.ctop.sptk.few .L22
  656. }
  657. ;;
  /* .L25: tails of the two-line group. */
  /* p10 (bit 1 of N): four more SIZE-values from A1 and from A2 are written */
  /*   as 8 consecutive slots at BO2 (A1 first, then A2 through B2 = */
  /*   BO2 + 4 * SIZE); BO2 ends up advanced by 8 * SIZE. */
  /* p11 (bit 0 of N): two more SIZE-values from A1 and from A2 are written */
  /*   one after another at BO3, which advances by 4 * SIZE. */
  /* COUNT is reset to 0 and execution falls through to .L30. */
  658. .align 32
  659. .L25:
  660. { .mmb
  661. (p10) LD f32 = [A1], SIZE
  662. (p10) LD f40 = [A2], SIZE
  663. nop __LINE__
  664. }
  665. ;;
  666. { .mmb
  667. (p10) LD f33 = [A1], SIZE
  668. (p10) LD f41 = [A2], SIZE
  669. nop __LINE__
  670. }
  671. ;;
  672. { .mmb
  673. (p10) LD f34 = [A1], SIZE
  674. (p10) LD f42 = [A2], SIZE
  675. nop __LINE__
  676. }
  677. ;;
  678. { .mmb
  679. (p10) LD f35 = [A1], SIZE
  680. (p10) LD f43 = [A2], SIZE
  681. nop __LINE__
  682. }
  683. ;;
  684. { .mmb
  685. (p11) LD f36 = [A1], SIZE
  686. (p11) LD f44 = [A2], SIZE
  687. nop __LINE__
  688. }
  689. ;;
  690. { .mmi
  691. (p11) LD f37 = [A1]
  692. (p11) LD f45 = [A2]
  693. adds B2 = 4 * SIZE, BO2
  694. }
  695. ;;
  /* p10 stores: BO2 takes the A1 values, B2 the A2 values. */
  696. { .mmb
  697. (p10) ST [BO2] = f32, SIZE
  698. (p10) ST [B2] = f40, SIZE
  699. nop __LINE__
  700. }
  701. ;;
  702. { .mmb
  703. (p10) ST [BO2] = f33, SIZE
  704. (p10) ST [B2] = f41, SIZE
  705. nop __LINE__
  706. }
  707. ;;
  708. { .mmb
  709. (p10) ST [BO2] = f34, SIZE
  710. (p10) ST [B2] = f42, SIZE
  711. nop __LINE__
  712. }
  713. ;;
  714. { .mmb
  715. (p10) ST [BO2] = f35, 5 * SIZE
  716. (p10) ST [B2] = f43, 5 * SIZE
  717. nop __LINE__
  718. }
  719. ;;
  /* p11 stores go through BO3 alone, so each bundle carries a stop between */
  /* its two dependent post-increment stores. */
  720. { .mmi
  721. (p11) ST [BO3] = f36, SIZE
  722. ;;
  723. (p11) ST [BO3] = f37, SIZE
  724. mov COUNT = r0
  725. }
  726. ;;
  727. { .mmi
  728. (p11) ST [BO3] = f44, SIZE
  729. ;;
  730. (p11) ST [BO3] = f45, SIZE
  731. nop __LINE__
  732. }
  733. ;;
  /* .L30: set up the final single source line; taken only when bit 0 of M */
  /* is set, otherwise control goes to .L999. */
  /*   A1 = A, A5 = A + 4 * SIZE, B1 = B, B2 = B + 4 * SIZE */
  /*   I = N >> 2; ar.lc = I - 1; the loop is skipped (to .L35) when I == 0 */
  /* The rotating predicates are cleared and p16 set, with ar.ec = 3, to */
  /* prime the pipelined loop at .L32. */
  734. .align 32
  735. .L30:
  736. { .mmi
  737. mov A1 = A
  738. adds A5 = 4 * SIZE, A
  739. mov pr.rot = 0
  740. }
  741. { .mmi
  742. mov B1 = B
  743. adds B2 = 4 * SIZE, B
  744. tbit.z p6, p0 = M, 0
  745. }
  746. ;;
  747. { .mmb
  748. nop __LINE__
  749. nop __LINE__
  750. nop __LINE__
  751. }
  /* p6 here means bit 0 of M is clear: nothing left to copy. */
  752. { .mib
  753. cmp.eq p16,p0 = r0, r0
  754. shr I = N, 2
  755. (p6) br.cond.dpnt .L999
  756. }
  757. ;;
  /* p6 is reused: now true when the N >> 2 loop has zero iterations. */
  758. { .mmi
  759. cmp.eq p6, p0 = 0, I
  760. adds I =-1, I
  761. mov ar.ec = 3
  762. }
  763. ;;
  764. { .mib
  765. nop __LINE__
  766. mov ar.lc = I
  767. (p6) br.cond.dpnt.few .L35
  768. }
  769. ;;
  /* .L32: software-pipelined copy loop for one source line (br.ctop, loads */
  /* under p16, stores under p18, fN loaded -> fN+2 stored). */
  /* Each iteration copies 8 consecutive SIZE-values: four via A1 -> B1 and */
  /* the next four via A5 -> B2 (both 4 * SIZE further along). The last */
  /* store rewinds each pointer by 3 * SIZE and LDB is then added. */
  /* This loop issues no lfetch. */
  770. .align 32
  771. .L32:
  772. { .mmb
  773. (p18) ST [B1] = f34, SIZE
  774. (p18) ST [B2] = f37, SIZE
  775. nop __LINE__
  776. }
  777. { .mmb
  778. (p16) LD f32 = [A1], SIZE
  779. (p16) LD f35 = [A5], SIZE
  780. nop __LINE__
  781. }
  782. ;;
  783. { .mmb
  784. (p18) ST [B1] = f40, SIZE
  785. (p18) ST [B2] = f43, SIZE
  786. nop __LINE__
  787. }
  788. { .mmb
  789. (p16) LD f38 = [A1], SIZE
  790. (p16) LD f41 = [A5], SIZE
  791. nop __LINE__
  792. }
  793. ;;
  794. { .mmb
  795. (p18) ST [B1] = f46, SIZE
  796. (p18) ST [B2] = f49, SIZE
  797. nop __LINE__
  798. }
  799. { .mmb
  800. (p16) LD f44 = [A1], SIZE
  801. (p16) LD f47 = [A5], SIZE
  802. nop __LINE__
  803. }
  804. ;;
  /* Final stores rewind B1 / B2; the loads skip the other stream's four values. */
  805. { .mmi
  806. (p18) ST [B1] = f52, -3 * SIZE
  807. (p18) ST [B2] = f55, -3 * SIZE
  808. nop __LINE__
  809. }
  810. { .mmb
  811. (p16) LD f50 = [A1], 5 * SIZE
  812. (p16) LD f53 = [A5], 5 * SIZE
  813. nop __LINE__
  814. }
  815. ;;
  816. { .mmb
  817. nop __LINE__
  818. nop __LINE__
  819. nop __LINE__
  820. }
  /* Step both store pointers by LDB for the next inner iteration. */
  821. { .mmb
  822. (p18) add B1 = B1, LDB
  823. (p18) add B2 = B2, LDB
  824. br.ctop.sptk.few .L32
  825. }
  826. ;;
  /* .L35: tails of the single source line. */
  /* p10 (bit 1 of N): four more SIZE-values from A1 are copied to BO2. */
  /* p11 (bit 0 of N): two more SIZE-values from A1 are copied to BO3. */
  /* Every access goes through one post-incremented pointer, so each bundle */
  /* carries a stop between its two dependent memory operations. */
  827. .align 32
  828. .L35:
  829. { .mmi
  830. (p10) LD f32 = [A1], SIZE
  831. ;;
  832. (p10) LD f33 = [A1], SIZE
  833. nop __LINE__
  834. }
  835. ;;
  836. { .mmi
  837. (p10) LD f34 = [A1], SIZE
  838. ;;
  839. (p10) LD f35 = [A1], SIZE
  840. nop __LINE__
  841. }
  842. ;;
  843. { .mmi
  844. (p11) LD f36 = [A1], SIZE
  845. ;;
  846. (p11) LD f37 = [A1]
  847. nop __LINE__
  848. }
  849. ;;
  850. { .mmi
  851. (p10) ST [BO2] = f32, SIZE
  852. ;;
  853. (p10) ST [BO2] = f33, SIZE
  854. nop __LINE__
  855. }
  856. ;;
  857. { .mmi
  858. (p10) ST [BO2] = f34, SIZE
  859. ;;
  860. (p10) ST [BO2] = f35, SIZE
  861. nop __LINE__
  862. }
  863. ;;
  864. { .mmi
  865. (p11) ST [BO3] = f36, SIZE
  866. ;;
  867. (p11) ST [BO3] = f37, SIZE
  868. nop __LINE__
  869. }
  870. ;;
  /* .L999: common exit. Restores the predicate registers (mask -1) and */
  /* ar.lc from the copies saved at entry, then returns through b0. */
  871. .align 32
  872. .L999:
  873. mov pr = PR, -1
  874. mov ar.lc = ARLC
  875. br.ret.sptk.many b0
  876. EPILOGUE