You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_tcopy.S 28 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects its assembler-side definitions -- confirm in common.h */
  39. #include "common.h"
  40. #define PREFETCHSIZE 24 /* PREA starts PREFETCHSIZE * SIZE bytes ahead of A1 */
  41. #define WPREFETCHSIZE 32 /* PREB is set WPREFETCHSIZE * SIZE bytes ahead of B1 */
  42. #ifndef XDOUBLE /* NOTE(review): both branches below are identical, so XDOUBLE currently changes nothing here */
  43. #define LD LDFD /* LD: used for every load through A1/A2 */
  44. #define ST STFD_NTA /* ST: used for every store through B1/B2/BO2/BO3/BO4 */
  45. #else
  46. #define LD LDFD
  47. #define ST STFD_NTA
  48. #endif
  49. #define PREA r2 /* address operand of the lfetch.nt1 prefetches on the A side */
  50. #define PREB r3 /* address operand of the lfetch.excl.nt1 prefetches on the B side */
  51. #define A1 r14 /* first load pointer into A */
  52. #define A2 r15 /* second load pointer: A + 4 * SIZE in the pipelined loops, A1 + LDA in the tail code */
  53. #define B1 r16 /* first store pointer into B */
  54. #define B2 r17 /* second store pointer, 4 * SIZE past B1 at loop entry; re-pointed at BO2/BO3/BO4 offsets in the tails */
  55. #define I r18 /* scratch for the ar.lc trip count, then the load-side countdown tested with cmp.ne 1, I */
  56. #define J r19 /* M >> 3; decremented once per pass through .L11 */
  57. #define BO2 r20 /* B + ((M * (N & -8)) << BASE_SHIFT); written when bit 2 of N is set */
  58. #define BO3 r21 /* B + ((M * (N & -4)) << BASE_SHIFT); written when bit 1 of N is set */
  59. #define BO4 r22 /* B + ((M * (N & -2)) << BASE_SHIFT); written when bit 0 of N is set */
  60. #define LDB r23 /* M << (BASE_SHIFT + 3); used to build the B1/B2 advance between stored blocks */
  61. #define II r24 /* N >> 3; store-side countdown tested with cmp.ne 1, II */
  62. #define TEMP1 r25 /* scratch: post-increment amount for the A1/A2 loads */
  63. #define TEMP2 r26 /* scratch: amount added to B1/B2 after each stored block */
  64. #define TEMP3 r27 /* scratch: precomputed A-side increment; also N & 7 at .L20 */
  65. #define LCOUNT r28 /* counts load passes (incremented under p16); reset to 0 when it reaches 4 in .L12 or 2 in .L112 */
  66. #define SCOUNT r29 /* counts store passes (incremented under p18); reset to 0 when it reaches 4 in .L12 or 2 in .L112 */
  67. #define ARLC r30 /* copy of ar.lc taken on entry */
  68. #define PR r31 /* copy of the predicate registers taken on entry */
  69. #define MLDA8 r8 /* 16 * SIZE - 8 * LDA, added to PREA under p14; r8 first holds N & -8 until setf.sig f33 reads it */
  70. #define M r32 /* r32-r36: presumably the incoming arguments in IA-64 stacked input registers -- confirm against the caller */
  71. #define N r33
  72. #define A r34 /* source base; advanced by 8 * LDA per .L11 pass and 4 * LDA on the M & 4 path */
  73. #define LDA r35 /* shifted left by BASE_SHIFT in place during setup, i.e. used as a byte stride afterwards */
  74. #define B r36 /* destination base; advanced by 64 * SIZE per .L11 pass, 32 * SIZE on the M & 4 path */
  75. PROLOGUE
  76. .prologue
  77. PROFCODE
  78. .body
  79. { .mmi
  80. setf.sig f32 = M
  81. and r8 = -8, N
  82. mov ARLC = ar.lc
  83. }
  84. ;;
  85. { .mmi
  86. setf.sig f33 = r8
  87. and r9 = -4, N
  88. mov PR = pr
  89. }
  90. ;;
  91. { .mmi
  92. setf.sig f34 = r9
  93. and r10 = -2, N
  94. shladd LDA = LDA, BASE_SHIFT, r0
  95. }
  96. ;;
  97. { .mmi
  98. setf.sig f35 = r10
  99. shladd MLDA8 = LDA, 3, r0
  100. shl LDB = M, BASE_SHIFT + 3
  101. }
  102. ;;
  103. { .mfi
  104. sub MLDA8 = r0, MLDA8
  105. xmpy.l f33 = f32, f33
  106. shr J = M, 3
  107. }
  108. { .mfi
  109. xmpy.l f34 = f32, f34
  110. }
  111. ;;
  112. { .mmf
  113. getf.sig BO2 = f33
  114. adds MLDA8 = 16 * SIZE, MLDA8
  115. xmpy.l f35 = f32, f35
  116. }
  117. ;;
  118. { .mmi
  119. getf.sig BO3 = f34
  120. getf.sig BO4 = f35
  121. nop __LINE__
  122. }
  123. ;;
  124. { .mmi
  125. shladd BO2 = BO2, BASE_SHIFT, B
  126. shladd BO3 = BO3, BASE_SHIFT, B
  127. shladd BO4 = BO4, BASE_SHIFT, B
  128. }
  129. { .mib
  130. cmp.eq p6, p0 = 0, J
  131. nop __LINE__
  132. (p6) br.cond.dpnt .L100
  133. }
  134. ;;
  135. .align 32
  136. .L11:
  137. { .mmi
  138. add I = 8, N
  139. mov A1 = A
  140. mov pr.rot = 0
  141. }
  142. { .mmi
  143. adds A2 = 4 * SIZE, A
  144. shladd A = LDA, 3, A
  145. shr II = N, 3
  146. }
  147. ;;
  148. { .mmi
  149. mov B1 = B
  150. cmp.eq p16, p0 = r0, r0
  151. mov ar.ec = 3
  152. }
  153. { .mmi
  154. adds B2 = 4 * SIZE, B
  155. adds B = 64 * SIZE, B
  156. shr I = I, 4
  157. }
  158. ;;
  159. { .mmi
  160. cmp.eq p8, p0 = 0, I
  161. shladd I = I, 2, r0
  162. nop __LINE__
  163. }
  164. ;;
  165. { .mmi
  166. mov LCOUNT = 0
  167. mov SCOUNT = 0
  168. adds I = -1, I
  169. }
  170. ;;
  171. { .mmi
  172. adds PREA = PREFETCHSIZE * SIZE, A1
  173. adds PREB = WPREFETCHSIZE * SIZE, B1
  174. mov ar.lc = I
  175. }
  176. { .mib
  177. adds J = -1, J
  178. mov I = II
  179. (p8) br.cond.dpnt .L20
  180. }
  181. ;;
  182. .align 32
  183. .L12:
  184. { .mmi
  185. (p18) ST [B1] = f34, 1 * SIZE
  186. (p18) ST [B2] = f46, 1 * SIZE
  187. (p18) cmp.ne.unc p13, p0 = 1, II
  188. }
  189. { .mmi
  190. (p16) lfetch.nt1 [PREA], LDA
  191. (p16) lfetch.excl.nt1 [PREB], LDB
  192. (p16) cmp.ne.unc p12, p0 = 1, I
  193. }
  194. ;;
  195. { .mmi
  196. (p18) ST [B1] = f37, 1 * SIZE
  197. (p18) ST [B2] = f49, 1 * SIZE
  198. (p18) adds SCOUNT = 1, SCOUNT
  199. }
  200. { .mmi
  201. (p16) LD f32 = [A1], SIZE
  202. (p16) LD f44 = [A2], SIZE
  203. (p16) adds LCOUNT = 1, LCOUNT
  204. }
  205. ;;
  206. { .mmi
  207. (p18) ST [B1] = f40, 1 * SIZE
  208. (p18) ST [B2] = f52, 1 * SIZE
  209. (p16) cmp.eq.unc p14, p0 = 4, LCOUNT
  210. }
  211. { .mmi
  212. (p16) LD f35 = [A1], SIZE
  213. (p16) LD f47 = [A2], SIZE
  214. adds TEMP1 = -3 * SIZE, LDA
  215. }
  216. ;;
  217. { .mmi
  218. (p18) ST [B1] = f43, 5 * SIZE
  219. (p18) ST [B2] = f55, 5 * SIZE
  220. (p18) cmp.eq.unc p15, p0 = 4, SCOUNT
  221. }
  222. { .mmi
  223. (p16) LD f38 = [A1], SIZE
  224. (p16) LD f50 = [A2], SIZE
  225. (p12) mov TEMP1 = 5 * SIZE
  226. }
  227. ;;
  228. { .mmi
  229. (p18) ST [B1] = f82, 1 * SIZE
  230. (p18) ST [B2] = f94, 1 * SIZE
  231. }
  232. { .mmi
  233. (p16) LD f41 = [A1], TEMP1
  234. (p16) LD f53 = [A2], TEMP1
  235. }
  236. ;;
  237. { .mmi
  238. (p18) ST [B1] = f85, 1 * SIZE
  239. (p18) ST [B2] = f97, 1 * SIZE
  240. mov TEMP2 = 5 * SIZE
  241. }
  242. { .mmi
  243. (p12) LD f56 = [A1], SIZE
  244. (p12) LD f68 = [A2], SIZE
  245. shladd TEMP3 = LDA, 3, r0
  246. }
  247. ;;
  248. { .mmi
  249. (p18) ST [B1] = f88, 1 * SIZE
  250. (p18) ST [B2] = f100, 1 * SIZE
  251. (p13) adds TEMP2 = - 11 * SIZE, LDB
  252. }
  253. { .mmi
  254. (p12) LD f59 = [A1], SIZE
  255. (p12) LD f71 = [A2], SIZE
  256. (p12) adds TEMP1 = - 11 * SIZE, LDA
  257. }
  258. ;;
  259. { .mmi
  260. (p18) ST [B1] = f91
  261. (p18) ST [B2] = f103
  262. (p18) add B1 = B1, TEMP2
  263. }
  264. { .mmi
  265. (p12) LD f62 = [A1], SIZE
  266. (p12) LD f74 = [A2], SIZE
  267. (p18) add B2 = B2, TEMP2
  268. }
  269. ;;
  270. { .mmi
  271. (p13) ST [B1] = f58, 1 * SIZE
  272. (p13) ST [B2] = f70, 1 * SIZE
  273. }
  274. { .mmi
  275. (p12) LD f65 = [A1], TEMP1
  276. (p12) LD f77 = [A2], TEMP1
  277. sub TEMP3 = LDA, TEMP3
  278. }
  279. ;;
  280. { .mmi
  281. (p13) ST [B1] = f61, 1 * SIZE
  282. (p13) ST [B2] = f73, 1 * SIZE
  283. }
  284. { .mmi
  285. (p16) lfetch.nt1 [PREA], LDA
  286. (p16) lfetch.excl.nt1 [PREB]
  287. adds TEMP3 = 5 * SIZE, TEMP3
  288. }
  289. ;;
  290. { .mmi
  291. (p13) ST [B1] = f64, 1 * SIZE
  292. (p13) ST [B2] = f76, 1 * SIZE
  293. }
  294. { .mmi
  295. (p16) LD f80 = [A1], SIZE
  296. (p16) LD f92 = [A2], SIZE
  297. adds TEMP1 = -3 * SIZE, LDA
  298. }
  299. ;;
  300. { .mmi
  301. (p13) ST [B1] = f67, 5 * SIZE
  302. (p13) ST [B2] = f79, 5 * SIZE
  303. }
  304. { .mmi
  305. (p16) LD f83 = [A1], SIZE
  306. (p16) LD f95 = [A2], SIZE
  307. (p14) mov TEMP1 = TEMP3
  308. }
  309. ;;
  310. { .mmi
  311. (p13) ST [B1] = f106, 1 * SIZE
  312. (p13) ST [B2] = f118, 1 * SIZE
  313. mov TEMP2 = 5 * SIZE
  314. }
  315. { .mmi
  316. (p16) LD f86 = [A1], SIZE
  317. (p16) LD f98 = [A2], SIZE
  318. (p12) mov TEMP1 = 5 * SIZE
  319. }
  320. ;;
  321. { .mmi
  322. (p13) ST [B1] = f109, 1 * SIZE
  323. (p13) ST [B2] = f121, 1 * SIZE
  324. sub TEMP2 = TEMP2, LDB
  325. }
  326. { .mmi
  327. (p16) LD f89 = [A1], TEMP1
  328. (p16) LD f101 = [A2], TEMP1
  329. }
  330. ;;
  331. { .mmi
  332. (p13) ST [B1] = f112, 1 * SIZE
  333. (p13) ST [B2] = f124, 1 * SIZE
  334. (p15) adds TEMP2 = -59 * SIZE, LDB
  335. }
  336. { .mmi
  337. (p12) LD f104 = [A1], SIZE
  338. (p12) LD f116 = [A2], SIZE
  339. (p14) add PREA = PREA, MLDA8
  340. }
  341. ;;
  342. { .mmi
  343. (p13) ST [B1] = f115
  344. (p13) ST [B2] = f127
  345. (p13) add B1 = B1, TEMP2
  346. }
  347. { .mmi
  348. (p12) LD f107 = [A1], SIZE
  349. (p12) LD f119 = [A2], SIZE
  350. adds TEMP1 = -11 * SIZE, LDA
  351. }
  352. ;;
  353. { .mmi
  354. (p12) LD f110 = [A1], SIZE
  355. (p12) LD f122 = [A2], SIZE
  356. (p14) mov TEMP1 = TEMP3
  357. }
  358. { .mmi
  359. (p14) mov LCOUNT = 0
  360. (p15) mov SCOUNT = 0
  361. adds PREB = WPREFETCHSIZE * SIZE, B1
  362. }
  363. ;;
  364. { .mmi
  365. (p12) LD f113 = [A1], TEMP1
  366. (p12) LD f125 = [A2], TEMP1
  367. (p13) add B2 = B2, TEMP2
  368. }
  369. { .mib
  370. (p14) adds I = -2, I
  371. (p15) adds II = -2, II
  372. br.ctop.sptk .L12
  373. }
  374. ;;
  375. .align 32
  376. .L20:
  377. { .mmi
  378. add A2 = A1, LDA
  379. and TEMP3 = 7, N
  380. tbit.nz p7, p0 = N, 2
  381. }
  382. ;;
  383. { .mmi
  384. (p7) LD f32 = [A1], SIZE
  385. (p7) LD f36 = [A2], SIZE
  386. cmp.eq p6, p0 = 0, TEMP3
  387. }
  388. ;;
  389. { .mmi
  390. (p7) LD f33 = [A1], SIZE
  391. (p7) LD f37 = [A2], SIZE
  392. adds TEMP1 = -3 * SIZE, LDA
  393. }
  394. ;;
  395. { .mmi
  396. (p7) LD f34 = [A1], SIZE
  397. (p7) LD f38 = [A2], SIZE
  398. add TEMP1 = TEMP1, LDA
  399. }
  400. ;;
  401. { .mmi
  402. (p7) LD f35 = [A1], TEMP1
  403. (p7) LD f39 = [A2], TEMP1
  404. (p6) cmp.ne.unc p10, p0 = 0, J
  405. }
  406. ;;
  407. { .mmb
  408. (p7) LD f40 = [A1], SIZE
  409. (p7) LD f44 = [A2], SIZE
  410. (p10) br.cond.dptk .L11
  411. }
  412. ;;
  413. { .mmi
  414. (p7) LD f41 = [A1], SIZE
  415. (p7) LD f45 = [A2], SIZE
  416. nop __LINE__
  417. }
  418. ;;
  419. { .mmi
  420. (p7) LD f42 = [A1], SIZE
  421. (p7) LD f46 = [A2], SIZE
  422. tbit.nz p8, p0 = N, 1
  423. }
  424. ;;
  425. { .mmi
  426. (p7) LD f43 = [A1], TEMP1
  427. (p7) LD f47 = [A2], TEMP1
  428. adds B2 = 4 * SIZE, BO2
  429. }
  430. ;;
  431. { .mmi
  432. (p7) ST [BO2] = f32, 1 * SIZE
  433. (p7) ST [B2 ] = f36, 1 * SIZE
  434. tbit.nz p9, p0 = N, 0
  435. }
  436. { .mmi
  437. (p7) LD f48 = [A1], SIZE
  438. (p7) LD f52 = [A2], SIZE
  439. nop __LINE__
  440. }
  441. ;;
  442. { .mmi
  443. (p7) ST [BO2] = f33, 1 * SIZE
  444. (p7) ST [B2 ] = f37, 1 * SIZE
  445. nop __LINE__
  446. }
  447. { .mmi
  448. (p7) LD f49 = [A1], SIZE
  449. (p7) LD f53 = [A2], SIZE
  450. nop __LINE__
  451. }
  452. ;;
  453. { .mmi
  454. (p7) ST [BO2] = f34, 1 * SIZE
  455. (p7) ST [B2 ] = f38, 1 * SIZE
  456. nop __LINE__
  457. }
  458. { .mmi
  459. (p7) LD f50 = [A1], SIZE
  460. (p7) LD f54 = [A2], SIZE
  461. nop __LINE__
  462. }
  463. ;;
  464. { .mmi
  465. (p7) ST [BO2] = f35, 5 * SIZE
  466. (p7) ST [B2 ] = f39, 5 * SIZE
  467. nop __LINE__
  468. }
  469. { .mmi
  470. (p7) LD f51 = [A1], TEMP1
  471. (p7) LD f55 = [A2], TEMP1
  472. mov TEMP1 = -1 * SIZE
  473. }
  474. ;;
  475. { .mmi
  476. (p7) ST [BO2] = f40, 1 * SIZE
  477. (p7) ST [B2 ] = f44, 1 * SIZE
  478. nop __LINE__
  479. }
  480. { .mmi
  481. (p7) LD f56 = [A1], SIZE
  482. (p7) LD f60 = [A2], SIZE
  483. shladd TEMP1 = LDA, 3, TEMP1
  484. }
  485. ;;
  486. { .mmi
  487. (p7) ST [BO2] = f41, 1 * SIZE
  488. (p7) ST [B2 ] = f45, 1 * SIZE
  489. nop __LINE__
  490. }
  491. { .mmi
  492. (p7) LD f57 = [A1], SIZE
  493. (p7) LD f61 = [A2], SIZE
  494. sub TEMP1 = 0, TEMP1
  495. }
  496. ;;
  497. { .mmi
  498. (p7) ST [BO2] = f42, 1 * SIZE
  499. (p7) ST [B2 ] = f46, 1 * SIZE
  500. nop __LINE__
  501. }
  502. { .mmi
  503. (p7) LD f58 = [A1], SIZE
  504. (p7) LD f62 = [A2], SIZE
  505. shladd TEMP1 = LDA, 1, TEMP1
  506. }
  507. ;;
  508. { .mmi
  509. (p7) ST [BO2] = f43, 5 * SIZE
  510. (p7) ST [B2 ] = f47, 5 * SIZE
  511. nop __LINE__
  512. }
  513. { .mmi
  514. (p7) LD f59 = [A1], TEMP1
  515. (p7) LD f63 = [A2], TEMP1
  516. nop __LINE__
  517. }
  518. ;;
  519. { .mmi
  520. (p7) ST [BO2] = f48, 1 * SIZE
  521. (p7) ST [B2 ] = f52, 1 * SIZE
  522. nop __LINE__
  523. }
  524. { .mmi
  525. add A2 = A1, LDA
  526. adds TEMP1 = -1 * SIZE, LDA
  527. nop __LINE__
  528. }
  529. ;;
  530. { .mmi
  531. (p7) ST [BO2] = f49, 1 * SIZE
  532. (p7) ST [B2 ] = f53, 1 * SIZE
  533. nop __LINE__
  534. }
  535. { .mmi
  536. (p8) LD f64 = [A1], SIZE
  537. (p8) LD f66 = [A2], SIZE
  538. add TEMP1 = TEMP1, LDA
  539. }
  540. ;;
  541. { .mmi
  542. (p7) ST [BO2] = f50, 1 * SIZE
  543. (p7) ST [B2 ] = f54, 1 * SIZE
  544. nop __LINE__
  545. }
  546. { .mmi
  547. (p8) LD f65 = [A1], TEMP1
  548. (p8) LD f67 = [A2], TEMP1
  549. nop __LINE__
  550. }
  551. ;;
  552. { .mmi
  553. (p7) ST [BO2] = f51, 5 * SIZE
  554. (p7) ST [B2 ] = f55, 5 * SIZE
  555. nop __LINE__
  556. }
  557. { .mmi
  558. (p8) LD f68 = [A1], SIZE
  559. (p8) LD f70 = [A2], SIZE
  560. nop __LINE__
  561. }
  562. ;;
  563. { .mmi
  564. (p7) ST [BO2] = f56, 1 * SIZE
  565. (p7) ST [B2 ] = f60, 1 * SIZE
  566. nop __LINE__
  567. }
  568. { .mmi
  569. (p8) LD f69 = [A1], TEMP1
  570. (p8) LD f71 = [A2], TEMP1
  571. mov TEMP3 = -1 * SIZE
  572. }
  573. ;;
  574. { .mmi
  575. (p7) ST [BO2] = f57, 1 * SIZE
  576. (p7) ST [B2 ] = f61, 1 * SIZE
  577. nop __LINE__
  578. }
  579. { .mmi
  580. (p8) LD f72 = [A1], SIZE
  581. (p8) LD f74 = [A2], SIZE
  582. shladd TEMP3 = LDA, 3, TEMP3
  583. }
  584. ;;
  585. { .mmi
  586. (p7) ST [BO2] = f58, 1 * SIZE
  587. (p7) ST [B2 ] = f62, 1 * SIZE
  588. nop __LINE__
  589. }
  590. { .mmi
  591. (p8) LD f73 = [A1], TEMP1
  592. (p8) LD f75 = [A2], TEMP1
  593. sub TEMP3 = 0, TEMP3
  594. }
  595. ;;
  596. { .mmi
  597. (p7) ST [BO2] = f59, 5 * SIZE
  598. (p7) ST [B2 ] = f63
  599. adds B2 = 4 * SIZE, BO3
  600. }
  601. { .mmi
  602. (p8) LD f76 = [A1], SIZE
  603. (p8) LD f78 = [A2], SIZE
  604. shladd TEMP3 = LDA, 1, TEMP3
  605. }
  606. ;;
  607. { .mmi
  608. (p8) ST [BO3] = f64, 1 * SIZE
  609. (p8) ST [B2 ] = f68, 1 * SIZE
  610. nop __LINE__
  611. }
  612. { .mmi
  613. (p8) LD f77 = [A1], TEMP3
  614. (p8) LD f79 = [A2], TEMP3
  615. nop __LINE__
  616. }
  617. ;;
  618. { .mmi
  619. (p8) ST [BO3] = f65, 1 * SIZE
  620. (p8) ST [B2 ] = f69, 1 * SIZE
  621. nop __LINE__
  622. }
  623. { .mmi
  624. add A2 = A1, LDA
  625. shladd TEMP3 = LDA, 1, r0
  626. nop __LINE__
  627. }
  628. ;;
  629. { .mmi
  630. (p8) ST [BO3] = f66, 1 * SIZE
  631. (p8) ST [B2 ] = f70, 1 * SIZE
  632. nop __LINE__
  633. }
  634. { .mmi
  635. (p9) LD f80 = [A1], TEMP3
  636. (p9) LD f81 = [A2], TEMP3
  637. nop __LINE__
  638. }
  639. ;;
  640. { .mmi
  641. (p8) ST [BO3] = f67, 5 * SIZE
  642. (p8) ST [B2 ] = f71, 5 * SIZE
  643. nop __LINE__
  644. }
  645. { .mmi
  646. (p9) LD f82 = [A1], TEMP3
  647. (p9) LD f83 = [A2], TEMP3
  648. nop __LINE__
  649. }
  650. ;;
  651. { .mmi
  652. (p8) ST [BO3] = f72, 1 * SIZE
  653. (p8) ST [B2 ] = f76, 1 * SIZE
  654. nop __LINE__
  655. }
  656. { .mmi
  657. (p9) LD f84 = [A1], TEMP3
  658. (p9) LD f85 = [A2], TEMP3
  659. nop __LINE__
  660. }
  661. ;;
  662. { .mmi
  663. (p8) ST [BO3] = f73, 1 * SIZE
  664. (p8) ST [B2 ] = f77, 1 * SIZE
  665. nop __LINE__
  666. }
  667. { .mmi
  668. (p9) LD f86 = [A1]
  669. (p9) LD f87 = [A2]
  670. nop __LINE__
  671. }
  672. ;;
  673. { .mmi
  674. (p8) ST [BO3] = f74, 1 * SIZE
  675. (p8) ST [B2 ] = f78, 1 * SIZE
  676. nop __LINE__
  677. }
  678. ;;
  679. { .mmi
  680. (p8) ST [BO3] = f75, 5 * SIZE
  681. (p8) ST [B2 ] = f79
  682. adds B2 = 4 * SIZE, BO4
  683. }
  684. ;;
  685. { .mmi
  686. (p9) ST [BO4] = f80, 1 * SIZE
  687. (p9) ST [B2 ] = f84, 1 * SIZE
  688. nop __LINE__
  689. }
  690. ;;
  691. { .mmi
  692. (p9) ST [BO4] = f81, 1 * SIZE
  693. (p9) ST [B2 ] = f85, 1 * SIZE
  694. nop __LINE__
  695. }
  696. ;;
  697. { .mmi
  698. (p9) ST [BO4] = f82, 1 * SIZE
  699. (p9) ST [B2 ] = f86, 1 * SIZE
  700. cmp.ne p8, p0 = 0, J
  701. }
  702. ;;
  703. { .mmb
  704. (p9) ST [BO4] = f83, 5 * SIZE
  705. (p9) ST [B2 ] = f87, 5 * SIZE
  706. (p8) br.cond.dptk .L11
  707. }
  708. ;;
  709. .align 32
  710. .L100:
  711. { .mmi
  712. mov A1 = A
  713. add I = 8, N
  714. mov pr.rot = 0
  715. }
  716. { .mmi
  717. adds A2 = 4 * SIZE, A
  718. tbit.z p6, p0 = M, 2
  719. }
  720. ;;
  721. { .mmi
  722. mov B1 = B
  723. adds B2 = 4 * SIZE, B
  724. mov ar.ec = 3
  725. }
  726. { .mib
  727. cmp.eq p16, p0 = r0, r0
  728. shr I = I, 4
  729. (p6) br.cond.dpnt .L200
  730. }
  731. ;;
  732. { .mmi
  733. cmp.eq p8, p0 = 0, I
  734. shladd I = I, 1, r0
  735. shladd A = LDA, 2, A
  736. }
  737. ;;
  738. { .mmi
  739. adds B = 32 * SIZE, B
  740. adds I = -1, I
  741. shr II = N, 3
  742. }
  743. ;;
  744. { .mmi
  745. mov LCOUNT = 0
  746. mov SCOUNT = 0
  747. mov ar.lc = I
  748. }
  749. { .mib
  750. nop __LINE__
  751. mov I = II
  752. (p8) br.cond.dpnt .L120
  753. }
  754. ;;
  755. .align 32
  756. .L112:
  757. { .mmi
  758. (p18) ST [B1] = f34, 1 * SIZE
  759. (p18) ST [B2] = f46, 1 * SIZE
  760. (p16) cmp.ne.unc p12, p0 = 1, I
  761. }
  762. { .mmi
  763. (p16) LD f32 = [A1], SIZE
  764. (p16) LD f44 = [A2], SIZE
  765. (p18) cmp.ne.unc p13, p0 = 1, II
  766. }
  767. ;;
  768. { .mmi
  769. (p18) ST [B1] = f37, 1 * SIZE
  770. (p18) ST [B2] = f49, 1 * SIZE
  771. nop __LINE__
  772. }
  773. { .mmi
  774. (p16) LD f35 = [A1], SIZE
  775. (p16) LD f47 = [A2], SIZE
  776. adds TEMP1 = -3 * SIZE, LDA
  777. }
  778. ;;
  779. { .mmi
  780. (p18) ST [B1] = f40, 1 * SIZE
  781. (p18) ST [B2] = f52, 1 * SIZE
  782. shladd TEMP3 = LDA, 2, r0
  783. }
  784. { .mmi
  785. (p16) LD f38 = [A1], SIZE
  786. (p16) LD f50 = [A2], SIZE
  787. (p12) mov TEMP1 = 5 * SIZE
  788. }
  789. ;;
  790. { .mmi
  791. (p18) ST [B1] = f43, 5 * SIZE
  792. (p18) ST [B2] = f55, 5 * SIZE
  793. (p16) adds LCOUNT = 1, LCOUNT
  794. }
  795. { .mmi
  796. (p16) LD f41 = [A1], TEMP1
  797. (p16) LD f53 = [A2], TEMP1
  798. (p18) adds SCOUNT = 1, SCOUNT
  799. }
  800. ;;
  801. { .mmi
  802. (p18) ST [B1] = f82, 1 * SIZE
  803. (p18) ST [B2] = f94, 1 * SIZE
  804. (p16) cmp.eq.unc p14, p0 = 2, LCOUNT
  805. }
  806. { .mmi
  807. (p12) LD f56 = [A1], SIZE
  808. (p12) LD f68 = [A2], SIZE
  809. (p18) cmp.eq.unc p15, p0 = 2, SCOUNT
  810. }
  811. ;;
  812. { .mmi
  813. (p18) ST [B1] = f85, 1 * SIZE
  814. (p18) ST [B2] = f97, 1 * SIZE
  815. mov TEMP2 = 5 * SIZE
  816. }
  817. { .mmi
  818. (p12) LD f59 = [A1], SIZE
  819. (p12) LD f71 = [A2], SIZE
  820. sub TEMP3 = LDA, TEMP3
  821. }
  822. ;;
  823. { .mmi
  824. (p18) ST [B1] = f88, 1 * SIZE
  825. (p18) ST [B2] = f100, 1 * SIZE
  826. (p13) adds TEMP2 = - 11 * SIZE, LDB
  827. }
  828. { .mmi
  829. (p12) LD f62 = [A1], SIZE
  830. (p12) LD f74 = [A2], SIZE
  831. (p12) adds TEMP1 = - 11 * SIZE, LDA
  832. }
  833. ;;
  834. { .mmi
  835. (p18) ST [B1] = f91
  836. (p18) ST [B2] = f103
  837. (p18) add B1 = B1, TEMP2
  838. }
  839. { .mmi
  840. (p12) LD f65 = [A1], TEMP1
  841. (p12) LD f77 = [A2], TEMP1
  842. (p18) add B2 = B2, TEMP2
  843. }
  844. ;;
  845. { .mmi
  846. (p13) ST [B1] = f58, 1 * SIZE
  847. (p13) ST [B2] = f70, 1 * SIZE
  848. adds TEMP3 = 5 * SIZE, TEMP3
  849. }
  850. { .mmi
  851. (p16) LD f80 = [A1], SIZE
  852. (p16) LD f92 = [A2], SIZE
  853. adds TEMP1 = -3 * SIZE, LDA
  854. }
  855. ;;
  856. { .mmi
  857. (p13) ST [B1] = f61, 1 * SIZE
  858. (p13) ST [B2] = f73, 1 * SIZE
  859. nop __LINE__
  860. }
  861. { .mmi
  862. (p16) LD f83 = [A1], SIZE
  863. (p16) LD f95 = [A2], SIZE
  864. (p14) mov TEMP1 = TEMP3
  865. }
  866. ;;
  867. { .mmi
  868. (p13) ST [B1] = f64, 1 * SIZE
  869. (p13) ST [B2] = f76, 1 * SIZE
  870. nop __LINE__
  871. }
  872. { .mmi
  873. (p16) LD f86 = [A1], SIZE
  874. (p16) LD f98 = [A2], SIZE
  875. (p12) mov TEMP1 = 5 * SIZE
  876. }
  877. ;;
  878. { .mmi
  879. (p13) ST [B1] = f67, 5 * SIZE
  880. (p13) ST [B2] = f79, 5 * SIZE
  881. (p14) mov LCOUNT = 0
  882. }
  883. { .mmi
  884. (p16) LD f89 = [A1], TEMP1
  885. (p16) LD f101 = [A2], TEMP1
  886. (p15) mov SCOUNT = 0
  887. }
  888. ;;
  889. { .mmi
  890. (p13) ST [B1] = f106, 1 * SIZE
  891. (p13) ST [B2] = f118, 1 * SIZE
  892. mov TEMP2 = 5 * SIZE
  893. }
  894. { .mmi
  895. (p12) LD f104 = [A1], SIZE
  896. (p12) LD f116 = [A2], SIZE
  897. nop __LINE__
  898. }
  899. ;;
  900. { .mmi
  901. (p13) ST [B1] = f109, 1 * SIZE
  902. (p13) ST [B2] = f121, 1 * SIZE
  903. sub TEMP2 = TEMP2, LDB
  904. }
  905. { .mmi
  906. (p12) LD f107 = [A1], SIZE
  907. (p12) LD f119 = [A2], SIZE
  908. adds TEMP1 = -11 * SIZE, LDA
  909. }
  910. ;;
  911. { .mmi
  912. (p13) ST [B1] = f112, 1 * SIZE
  913. (p13) ST [B2] = f124, 1 * SIZE
  914. (p15) adds TEMP2 = -27 * SIZE, LDB
  915. }
  916. { .mmi
  917. (p12) LD f110 = [A1], SIZE
  918. (p12) LD f122 = [A2], SIZE
  919. (p14) mov TEMP1 = TEMP3
  920. }
  921. ;;
  922. { .mmi
  923. (p13) ST [B1] = f115
  924. (p13) ST [B2] = f127
  925. (p13) add B1 = B1, TEMP2
  926. }
  927. { .mmi
  928. (p12) LD f113 = [A1], TEMP1
  929. (p12) LD f125 = [A2], TEMP1
  930. (p13) add B2 = B2, TEMP2
  931. }
  932. ;;
  933. { .mmb
  934. (p14) adds I = -2, I
  935. (p15) adds II = -2, II
  936. br.ctop.sptk .L112
  937. }
  938. ;;
  939. .align 32
  940. .L120:
  941. { .mmi
  942. add A2 = A1, LDA
  943. nop __LINE__
  944. tbit.nz p7, p0 = N, 2
  945. }
  946. ;;
  947. { .mmi
  948. (p7) LD f32 = [A1], SIZE
  949. (p7) LD f36 = [A2], SIZE
  950. tbit.nz p8, p0 = N, 1
  951. }
  952. ;;
  953. { .mmi
  954. (p7) LD f33 = [A1], SIZE
  955. (p7) LD f37 = [A2], SIZE
  956. adds TEMP1 = -3 * SIZE, LDA
  957. }
  958. ;;
  959. { .mmi
  960. (p7) LD f34 = [A1], SIZE
  961. (p7) LD f38 = [A2], SIZE
  962. add TEMP1 = TEMP1, LDA
  963. }
  964. ;;
  965. { .mmi
  966. (p7) LD f35 = [A1], TEMP1
  967. (p7) LD f39 = [A2], TEMP1
  968. tbit.nz p9, p0 = N, 0
  969. }
  970. ;;
  971. { .mmi
  972. (p7) LD f40 = [A1], SIZE
  973. (p7) LD f44 = [A2], SIZE
  974. mov TEMP2 = -1 * SIZE
  975. }
  976. ;;
  977. { .mmi
  978. (p7) LD f41 = [A1], SIZE
  979. (p7) LD f45 = [A2], SIZE
  980. shladd TEMP2 = LDA, 1, TEMP2
  981. }
  982. ;;
  983. { .mmi
  984. (p7) LD f42 = [A1], SIZE
  985. (p7) LD f46 = [A2], SIZE
  986. sub TEMP2 = 0, TEMP2
  987. }
  988. ;;
  989. { .mmi
  990. (p7) LD f43 = [A1], TEMP2
  991. (p7) LD f47 = [A2]
  992. nop __LINE__
  993. }
  994. ;;
  995. { .mmi
  996. add A2 = A1, LDA
  997. adds TEMP1 = -1 * SIZE, LDA
  998. mov TEMP2 = -1 * SIZE
  999. }
  1000. ;;
  1001. { .mmi
  1002. (p8) LD f48 = [A1], SIZE
  1003. (p8) LD f50 = [A2], SIZE
  1004. add TEMP1 = TEMP1, LDA
  1005. }
  1006. ;;
  1007. { .mmi
  1008. (p8) LD f49 = [A1], TEMP1
  1009. (p8) LD f51 = [A2], TEMP1
  1010. shladd TEMP2 = LDA, 1, TEMP2
  1011. }
  1012. ;;
  1013. { .mmi
  1014. (p8) LD f52 = [A1], SIZE
  1015. (p8) LD f54 = [A2], SIZE
  1016. sub TEMP2 = r0, TEMP2
  1017. }
  1018. ;;
  1019. { .mmi
  1020. (p8) LD f53 = [A1], TEMP2
  1021. (p8) LD f55 = [A2], TEMP2
  1022. nop __LINE__
  1023. }
  1024. ;;
  1025. { .mmi
  1026. add A2 = A1, LDA
  1027. adds B2 = 4 * SIZE, BO2
  1028. nop __LINE__
  1029. }
  1030. ;;
  1031. { .mmi
  1032. (p9) LD f56 = [A1]
  1033. nop __LINE__
  1034. (p9) shladd A1 = LDA, 1, A1
  1035. }
  1036. { .mmi
  1037. (p9) LD f57 = [A2]
  1038. nop __LINE__
  1039. (p9) shladd A2 = LDA, 1, A2
  1040. }
  1041. ;;
  1042. { .mmi
  1043. (p7) ST [BO2] = f32, 1 * SIZE
  1044. (p7) ST [B2 ] = f36, 1 * SIZE
  1045. nop __LINE__
  1046. }
  1047. { .mmi
  1048. (p9) LD f58 = [A1]
  1049. (p9) LD f59 = [A2]
  1050. nop __LINE__
  1051. }
  1052. ;;
  1053. ;;
  1054. { .mmi
  1055. (p7) ST [BO2] = f33, 1 * SIZE
  1056. (p7) ST [B2 ] = f37, 1 * SIZE
  1057. nop __LINE__
  1058. }
  1059. ;;
  1060. { .mmi
  1061. (p7) ST [BO2] = f34, 1 * SIZE
  1062. (p7) ST [B2 ] = f38, 1 * SIZE
  1063. nop __LINE__
  1064. }
  1065. ;;
  1066. { .mmi
  1067. (p7) ST [BO2] = f35, 5 * SIZE
  1068. (p7) ST [B2 ] = f39, 5 * SIZE
  1069. nop __LINE__
  1070. }
  1071. ;;
  1072. { .mmi
  1073. (p7) ST [BO2] = f40, 1 * SIZE
  1074. (p7) ST [B2 ] = f44, 1 * SIZE
  1075. nop __LINE__
  1076. }
  1077. ;;
  1078. { .mmi
  1079. (p7) ST [BO2] = f41, 1 * SIZE
  1080. (p7) ST [B2 ] = f45, 1 * SIZE
  1081. nop __LINE__
  1082. }
  1083. ;;
  1084. { .mmi
  1085. (p7) ST [BO2] = f42, 1 * SIZE
  1086. (p7) ST [B2 ] = f46, 1 * SIZE
  1087. nop __LINE__
  1088. }
  1089. ;;
  1090. { .mmi
  1091. (p7) ST [BO2] = f43, 5 * SIZE
  1092. (p7) ST [B2 ] = f47
  1093. adds B2 = 4 * SIZE, BO3
  1094. }
  1095. ;;
  1096. { .mmi
  1097. (p8) ST [BO3] = f48, 1 * SIZE
  1098. (p8) ST [B2 ] = f52, 1 * SIZE
  1099. nop __LINE__
  1100. }
  1101. ;;
  1102. { .mmi
  1103. (p8) ST [BO3] = f49, 1 * SIZE
  1104. (p8) ST [B2 ] = f53, 1 * SIZE
  1105. nop __LINE__
  1106. }
  1107. ;;
  1108. { .mmi
  1109. (p8) ST [BO3] = f50, 1 * SIZE
  1110. (p8) ST [B2 ] = f54, 1 * SIZE
  1111. nop __LINE__
  1112. }
  1113. ;;
  1114. { .mmi
  1115. (p8) ST [BO3] = f51, 5 * SIZE
  1116. (p8) ST [B2 ] = f55
  1117. adds B2 = 2 * SIZE, BO4
  1118. }
  1119. ;;
  1120. { .mmi
  1121. (p9) ST [BO4] = f56, 1 * SIZE
  1122. (p9) ST [B2 ] = f58, 1 * SIZE
  1123. nop __LINE__
  1124. }
  1125. ;;
  1126. { .mmi
  1127. (p9) ST [BO4] = f57, 3 * SIZE
  1128. (p9) ST [B2 ] = f59
  1129. nop __LINE__
  1130. }
  1131. ;;
  1132. .align 32
  1133. .L200:
  1134. { .mmi
  1135. add I = 8, N
  1136. mov A1 = A
  1137. mov pr.rot = 0
  1138. }
  1139. { .mmi
  1140. adds A2 = 4 * SIZE, A
  1141. nop __LINE__
  1142. tbit.z p6, p0 = M, 1
  1143. }
  1144. ;;
  1145. { .mmi
  1146. mov B1 = B
  1147. cmp.eq p16, p0 = r0, r0
  1148. mov ar.ec = 3
  1149. }
  1150. { .mib
  1151. adds B2 = 4 * SIZE, B
  1152. shr I = I, 4
  1153. (p6) br.cond.dpnt .L300
  1154. }
  1155. ;;
  1156. { .mmi
  1157. shladd A = LDA, 1, A
  1158. adds B = 16 * SIZE, B
  1159. shr II = N, 3
  1160. }
  1161. { .mmi
  1162. cmp.eq p8, p0 = 0, I
  1163. adds I = -1, I
  1164. nop __LINE__
  1165. }
  1166. ;;
  1167. { .mmi
  1168. nop __LINE__
  1169. nop __LINE__
  1170. mov ar.lc = I
  1171. }
  1172. { .mib
  1173. mov I = II
  1174. nop __LINE__
  1175. (p8) br.cond.dpnt .L220
  1176. }
  1177. ;;
  1178. .align 32
  1179. .L212:
  1180. { .mmi
  1181. (p18) ST [B1] = f34, 1 * SIZE
  1182. (p18) ST [B2] = f46, 1 * SIZE
  1183. (p16) cmp.ne.unc p12, p0 = 1, I
  1184. }
  1185. { .mmi
  1186. (p16) LD f32 = [A1], SIZE
  1187. (p16) LD f44 = [A2], SIZE
  1188. (p18) cmp.ne.unc p13, p0 = 1, II
  1189. }
  1190. ;;
  1191. { .mmi
  1192. (p18) ST [B1] = f37, 1 * SIZE
  1193. (p18) ST [B2] = f49, 1 * SIZE
  1194. adds TEMP1 = -3 * SIZE, LDA
  1195. }
  1196. { .mmi
  1197. (p16) LD f35 = [A1], SIZE
  1198. (p16) LD f47 = [A2], SIZE
  1199. nop __LINE__
  1200. }
  1201. ;;
  1202. { .mmi
  1203. (p18) ST [B1] = f40, 1 * SIZE
  1204. (p18) ST [B2] = f52, 1 * SIZE
  1205. (p12) mov TEMP1 = 5 * SIZE
  1206. }
  1207. { .mmi
  1208. (p16) LD f38 = [A1], SIZE
  1209. (p16) LD f50 = [A2], SIZE
  1210. nop __LINE__
  1211. }
  1212. ;;
  1213. { .mmi
  1214. (p18) ST [B1] = f43, 5 * SIZE
  1215. (p18) ST [B2] = f55, 5 * SIZE
  1216. nop __LINE__
  1217. }
  1218. { .mmi
  1219. (p16) LD f41 = [A1], TEMP1
  1220. (p16) LD f53 = [A2], TEMP1
  1221. nop __LINE__
  1222. }
  1223. ;;
  1224. { .mmi
  1225. (p18) ST [B1] = f82, 1 * SIZE
  1226. (p18) ST [B2] = f94, 1 * SIZE
  1227. nop __LINE__
  1228. }
  1229. { .mmi
  1230. (p12) LD f56 = [A1], SIZE
  1231. (p12) LD f68 = [A2], SIZE
  1232. nop __LINE__
  1233. }
  1234. ;;
  1235. { .mmi
  1236. (p18) ST [B1] = f85, 1 * SIZE
  1237. (p18) ST [B2] = f97, 1 * SIZE
  1238. mov TEMP2 = 5 * SIZE
  1239. }
  1240. { .mmi
  1241. (p12) LD f59 = [A1], SIZE
  1242. (p12) LD f71 = [A2], SIZE
  1243. nop __LINE__
  1244. }
  1245. ;;
  1246. { .mmi
  1247. (p18) ST [B1] = f88, 1 * SIZE
  1248. (p18) ST [B2] = f100, 1 * SIZE
  1249. (p13) adds TEMP2 = - 11 * SIZE, LDB
  1250. }
  1251. { .mmi
  1252. (p12) LD f62 = [A1], SIZE
  1253. (p12) LD f74 = [A2], SIZE
  1254. (p12) adds TEMP1 = - 11 * SIZE, LDA
  1255. }
  1256. ;;
  1257. { .mmi
  1258. (p18) ST [B1] = f91
  1259. (p18) ST [B2] = f103
  1260. (p18) add B1 = B1, TEMP2
  1261. }
  1262. { .mmi
  1263. (p12) LD f65 = [A1], TEMP1
  1264. (p12) LD f77 = [A2], TEMP1
  1265. (p18) add B2 = B2, TEMP2
  1266. }
  1267. ;;
  1268. { .mmi
  1269. (p13) ST [B1] = f58, 1 * SIZE
  1270. (p13) ST [B2] = f70, 1 * SIZE
  1271. nop __LINE__
  1272. }
  1273. { .mmi
  1274. (p16) LD f80 = [A1], SIZE
  1275. (p16) LD f92 = [A2], SIZE
  1276. sub TEMP1 = r0, LDA
  1277. }
  1278. ;;
  1279. { .mmi
  1280. (p13) ST [B1] = f61, 1 * SIZE
  1281. (p13) ST [B2] = f73, 1 * SIZE
  1282. nop __LINE__
  1283. }
  1284. { .mmi
  1285. (p16) LD f83 = [A1], SIZE
  1286. (p16) LD f95 = [A2], SIZE
  1287. (p16) adds TEMP1 = 5 * SIZE, TEMP1
  1288. }
  1289. ;;
  1290. { .mmi
  1291. (p13) ST [B1] = f64, 1 * SIZE
  1292. (p13) ST [B2] = f76, 1 * SIZE
  1293. nop __LINE__
  1294. }
  1295. { .mmi
  1296. (p16) LD f86 = [A1], SIZE
  1297. (p16) LD f98 = [A2], SIZE
  1298. (p12) mov TEMP1 = 5 * SIZE
  1299. }
  1300. ;;
  1301. { .mmi
  1302. (p13) ST [B1] = f67, 5 * SIZE
  1303. (p13) ST [B2] = f79, 5 * SIZE
  1304. nop __LINE__
  1305. }
  1306. { .mmi
  1307. (p16) LD f89 = [A1], TEMP1
  1308. (p16) LD f101 = [A2], TEMP1
  1309. adds TEMP1 = -11 * SIZE, LDA
  1310. }
  1311. ;;
  1312. { .mmi
  1313. (p13) ST [B1] = f106, 1 * SIZE
  1314. (p13) ST [B2] = f118, 1 * SIZE
  1315. mov TEMP2 = 5 * SIZE
  1316. }
  1317. { .mmi
  1318. (p12) LD f104 = [A1], SIZE
  1319. (p12) LD f116 = [A2], SIZE
  1320. (p16) shladd TEMP1 = LDA, 1, r0
  1321. }
  1322. ;;
  1323. { .mmi
  1324. (p13) ST [B1] = f109, 1 * SIZE
  1325. (p13) ST [B2] = f121, 1 * SIZE
  1326. sub TEMP2 = TEMP2, LDB
  1327. }
  1328. { .mmi
  1329. (p12) LD f107 = [A1], SIZE
  1330. (p12) LD f119 = [A2], SIZE
  1331. (p16) sub TEMP1 = LDA, TEMP1
  1332. }
  1333. ;;
  1334. { .mmi
  1335. (p13) ST [B1] = f112, 1 * SIZE
  1336. (p13) ST [B2] = f124, 1 * SIZE
  1337. (p18) adds TEMP2 = -11 * SIZE, LDB
  1338. }
  1339. { .mmi
  1340. (p12) LD f110 = [A1], SIZE
  1341. (p12) LD f122 = [A2], SIZE
  1342. (p16) adds TEMP1 = 5 * SIZE, TEMP1
  1343. }
  1344. ;;
  1345. { .mmi
  1346. (p13) ST [B1] = f115
  1347. (p13) ST [B2] = f127
  1348. (p13) add B1 = B1, TEMP2
  1349. }
  1350. { .mmi
  1351. (p12) LD f113 = [A1], TEMP1
  1352. (p12) LD f125 = [A2], TEMP1
  1353. (p13) add B2 = B2, TEMP2
  1354. }
  1355. ;;
  1356. { .mmb
  1357. (p16) adds I = -2, I
  1358. (p18) adds II = -2, II
  1359. br.ctop.sptk .L212
  1360. }
  1361. ;;
  1362. .align 32
  1363. .L220: /* Remainder copy for two source streams (A1 and A1 + LDA); bits 2, 1, 0 of N select 4-, 2- and 1-element pieces */
  1364. { .mmi
  1365. add A2 = A1, LDA /* A2 = second stream, one LDA past A1 */
  1366. nop __LINE__
  1367. tbit.nz p7, p0 = N, 2 /* p7 <- bit 2 of N: a 4-element piece is present */
  1368. }
  1369. ;;
  1370. { .mmi
  1371. (p7) LD f32 = [A1], SIZE /* 4-element piece: f32..f35 via A1, f36..f39 via A2 */
  1372. (p7) LD f36 = [A2], SIZE
  1373. tbit.nz p8, p0 = N, 1 /* p8 <- bit 1 of N: a 2-element piece is present */
  1374. }
  1375. ;;
  1376. { .mmi
  1377. (p7) LD f33 = [A1], SIZE
  1378. (p7) LD f37 = [A2], SIZE
  1379. tbit.nz p9, p0 = N, 0 /* p9 <- bit 0 of N: a single trailing element is present */
  1380. }
  1381. ;;
  1382. { .mmi
  1383. (p7) LD f34 = [A1], SIZE
  1384. (p7) LD f38 = [A2], SIZE
  1385. nop __LINE__
  1386. }
  1387. ;;
  1388. { .mmi
  1389. (p7) LD f35 = [A1], SIZE /* A1 has now advanced 4 * SIZE when p7 is set */
  1390. (p7) LD f39 = [A2] /* no post-increment: A2 is recomputed from A1 next */
  1391. nop __LINE__
  1392. }
  1393. ;;
  1394. { .mmi
  1395. add A2 = A1, LDA /* re-derive the second stream from the (possibly advanced) A1 */
  1396. nop __LINE__
  1397. nop __LINE__
  1398. }
  1399. ;;
  1400. { .mmi
  1401. (p8) LD f40 = [A1], SIZE /* 2-element piece: f40, f41 via A1 and f42, f43 via A2 */
  1402. (p8) LD f42 = [A2], SIZE
  1403. nop __LINE__
  1404. }
  1405. ;;
  1406. { .mmi
  1407. (p8) LD f41 = [A1], SIZE
  1408. (p8) LD f43 = [A2]
  1409. nop __LINE__
  1410. }
  1411. ;;
  1412. { .mmi
  1413. add A2 = A1, LDA /* re-derive the second stream once more for the last element */
  1414. nop __LINE__
  1415. nop __LINE__
  1416. }
  1417. ;;
  1418. { .mmi
  1419. (p9) LD f44 = [A1] /* 1-element piece: f44 via A1, f45 via A2 */
  1420. (p9) LD f45 = [A2]
  1421. adds B2 = 4 * SIZE, BO2 /* B2 = BO2 + 4 elements: the A2 data lands right after the A1 data */
  1422. }
  1423. ;;
  1424. { .mmi
  1425. (p7) ST [BO2] = f32, 1 * SIZE /* BO2 is set outside this chunk; presumably the output cursor for 4-wide pieces (TODO confirm) */
  1426. (p7) ST [B2 ] = f36, 1 * SIZE
  1427. nop __LINE__
  1428. }
  1429. ;;
  1430. { .mmi
  1431. (p7) ST [BO2] = f33, 1 * SIZE
  1432. (p7) ST [B2 ] = f37, 1 * SIZE
  1433. nop __LINE__
  1434. }
  1435. ;;
  1436. { .mmi
  1437. (p7) ST [BO2] = f34, 1 * SIZE
  1438. (p7) ST [B2 ] = f38, 1 * SIZE
  1439. nop __LINE__
  1440. }
  1441. ;;
  1442. { .mmi
  1443. (p7) ST [BO2] = f35, 5 * SIZE /* 1 + 1 + 1 + 5: BO2 ends 8 elements past its starting value */
  1444. (p7) ST [B2 ] = f39 /* reads the old B2; the adds below rewrites it in the same group */
  1445. adds B2 = 2 * SIZE, BO3 /* B2 = BO3 + 2 elements for the 2-wide piece */
  1446. }
  1447. ;;
  1448. { .mmi
  1449. (p8) ST [BO3] = f40, 1 * SIZE /* BO3 is set outside this chunk; presumably the cursor for 2-wide pieces (TODO confirm) */
  1450. (p8) ST [B2 ] = f42, 1 * SIZE
  1451. nop __LINE__
  1452. }
  1453. ;;
  1454. { .mmi
  1455. (p8) ST [BO3] = f41, 3 * SIZE /* 1 + 3: BO3 ends 4 elements past its starting value */
  1456. (p8) ST [B2 ] = f43
  1457. adds B2 = 1 * SIZE, BO4 /* B2 = BO4 + 1 element for the single-element piece */
  1458. }
  1459. ;;
  1460. { .mmi
  1461. (p9) ST [BO4] = f44, 2 * SIZE /* BO4 advances 2 elements: one from each stream */
  1462. (p9) ST [B2 ] = f45
  1463. nop __LINE__
  1464. }
  1465. ;; /* no branch here: execution continues into .L300 */
  1466. .align 32
  1467. .L300: /* Set up the single-stream pass, which runs only when bit 0 of M is set; otherwise exit through .L999 */
  1468. { .mmi
  1469. add I = 8, N /* I = N + 8; shifted right by 4 below to give the loop trip count */
  1470. mov A1 = A /* A1 = current value of A (first 4 elements of each 8-element group) */
  1471. mov pr.rot = 0 /* clear all rotating predicates before starting the pipelined loop */
  1472. }
  1473. { .mmi
  1474. mov B1 = B /* B1 = current value of B */
  1475. adds A2 = 4 * SIZE, A /* A2 = A + 4 elements (second 4 elements of each group) */
  1476. tbit.z p6, p0 = M, 0 /* p6 <- bit 0 of M is clear: nothing to do in this pass */
  1477. }
  1478. ;;
  1479. { .mmi
  1480. adds B2 = 4 * SIZE, B /* B2 = B + 4 elements, paired with A2 */
  1481. cmp.eq p16, p0 = r0, r0 /* p16 = 1: enable the first (load) pipeline stage */
  1482. mov ar.ec = 3 /* epilog count 3: stages p16..p18 are used by .L312 */
  1483. }
  1484. { .mib
  1485. nop __LINE__
  1486. shr I = I, 4 /* I = (N + 8) >> 4: iterations of .L312, each covering up to two 8-element groups */
  1487. (p6) br.cond.dpnt .L999
  1488. }
  1489. ;;
  1490. { .mmi
  1491. cmp.eq p8, p0 = 0, I /* p8 <- trip count is zero: skip the loop entirely */
  1492. adds I = -1, I /* ar.lc takes the trip count minus one */
  1493. shr II = N, 3 /* II = N >> 3: number of full 8-element groups */
  1494. }
  1495. ;;
  1496. { .mmi
  1497. nop __LINE__
  1498. nop __LINE__
  1499. mov ar.lc = I /* reads I before the mov below overwrites it in the same group */
  1500. }
  1501. { .mib
  1502. nop __LINE__
  1503. mov I = II /* from here I counts groups still to load; II counts groups still to store */
  1504. (p8) br.cond.dpnt .L320
  1505. }
  1506. ;;
  1507. .align 32
  1508. .L312: /* Modulo-scheduled copy loop: stage p16 loads, stage p18 stores what was loaded two rotations earlier */
  1509. { .mmi
  1510. (p18) ST [B1] = f34, 1 * SIZE /* f34 / f46 are f32 / f44 after two register rotations */
  1511. (p18) ST [B2] = f46, 1 * SIZE
  1512. (p16) cmp.ne.unc p12, p0 = 1, I /* p12 <- a second group remains to load (I != 1); cleared when p16 is off */
  1513. }
  1514. { .mmi
  1515. (p16) LD f32 = [A1], SIZE /* first group: f32, f35, f38, f41 via A1 and f44, f47, f50, f53 via A2 */
  1516. (p16) LD f44 = [A2], SIZE
  1517. (p18) cmp.ne.unc p13, p0 = 1, II /* p13 <- a second group remains to store (II != 1); cleared when p18 is off */
  1518. }
  1519. ;;
  1520. { .mmi
  1521. (p18) ST [B1] = f37, 1 * SIZE
  1522. (p18) ST [B2] = f49, 1 * SIZE
  1523. adds TEMP2 = - 3 * SIZE, LDB /* TEMP2 = LDB - 3 * SIZE: with three SIZE post-increments, B1 / B2 step LDB per group */
  1524. }
  1525. { .mmi
  1526. (p16) LD f35 = [A1], SIZE
  1527. (p16) LD f47 = [A2], SIZE
  1528. nop __LINE__
  1529. }
  1530. ;;
  1531. { .mmi
  1532. (p18) ST [B1] = f40, 1 * SIZE
  1533. (p18) ST [B2] = f52, 1 * SIZE
  1534. nop __LINE__
  1535. }
  1536. { .mmi
  1537. (p16) LD f38 = [A1], SIZE
  1538. (p16) LD f50 = [A2], SIZE
  1539. nop __LINE__
  1540. }
  1541. ;;
  1542. { .mmi
  1543. (p18) ST [B1] = f43 /* fourth store has no post-increment; the add below moves B1 */
  1544. (p18) ST [B2] = f55
  1545. (p18) add B1 = B1, TEMP2 /* B1 is now LDB past where this group started */
  1546. }
  1547. { .mmi
  1548. (p16) LD f41 = [A1], 5 * SIZE /* 1 + 1 + 1 + 5: A1 moves 8 elements, skipping the 4 read via A2 */
  1549. (p16) LD f53 = [A2], 5 * SIZE
  1550. (p18) add B2 = B2, TEMP2
  1551. }
  1552. ;;
  1553. { .mmi
  1554. (p13) ST [B1] = f58, 1 * SIZE /* second group: f58 / f70 are f56 / f68 after two rotations */
  1555. (p13) ST [B2] = f70, 1 * SIZE
  1556. (p16) adds I = -2, I /* two groups accounted for per iteration on the load side */
  1557. }
  1558. { .mmi
  1559. (p12) LD f56 = [A1], SIZE /* second group: f56, f59, f62, f65 via A1 and f68, f71, f74, f77 via A2 */
  1560. (p12) LD f68 = [A2], SIZE
  1561. (p18) adds II = -2, II /* two groups accounted for per iteration on the store side */
  1562. }
  1563. ;;
  1564. { .mmi
  1565. (p13) ST [B1] = f61, 1 * SIZE
  1566. (p13) ST [B2] = f73, 1 * SIZE
  1567. nop __LINE__
  1568. }
  1569. { .mmi
  1570. (p12) LD f59 = [A1], SIZE
  1571. (p12) LD f71 = [A2], SIZE
  1572. nop __LINE__
  1573. }
  1574. ;;
  1575. { .mmi
  1576. (p13) ST [B1] = f64, 1 * SIZE
  1577. (p13) ST [B2] = f76, 1 * SIZE
  1578. nop __LINE__
  1579. }
  1580. { .mmi
  1581. (p12) LD f62 = [A1], SIZE
  1582. (p12) LD f74 = [A2], SIZE
  1583. nop __LINE__
  1584. }
  1585. ;;
  1586. { .mmi
  1587. (p13) ST [B1] = f67
  1588. (p13) ST [B2] = f79
  1589. (p13) add B1 = B1, TEMP2 /* step B1 by LDB again for the second group */
  1590. }
  1591. { .mmi
  1592. (p12) LD f65 = [A1], 5 * SIZE /* A1 / A2 again end 8 elements past the group start */
  1593. (p12) LD f77 = [A2], 5 * SIZE
  1594. (p13) add B2 = B2, TEMP2
  1595. }
  1596. ;;
  1597. { .mmb
  1598. nop __LINE__
  1599. nop __LINE__
  1600. br.ctop.sptk .L312 /* rotate registers; loop while ar.lc > 0, then drain the remaining stages via ar.ec */
  1601. }
  1602. ;;
  1603. .align 32
  1604. .L320: /* Remainder copy for the single stream at A1; bits 2, 1, 0 of N select 4-, 2- and 1-element pieces */
  1605. { .mmi
  1606. adds A2 = 2 * SIZE, A1 /* A2 = A1 + 2 elements: second half of the 4-element piece */
  1607. adds B2 = 2 * SIZE, BO2 /* B2 = BO2 + 2 elements; BO2 is set outside this chunk (presumably the 4-wide output cursor, TODO confirm) */
  1608. tbit.nz p7, p0 = N, 2 /* p7 <- bit 2 of N: a 4-element piece is present */
  1609. }
  1610. ;;
  1611. { .mmi
  1612. (p7) LD f32 = [A1], SIZE /* 4-element piece: f32, f33 via A1 and f34, f35 via A2 */
  1613. (p7) LD f34 = [A2], SIZE
  1614. tbit.nz p8, p0 = N, 1 /* p8 <- bit 1 of N: a 2-element piece is present */
  1615. }
  1616. ;;
  1617. { .mmi
  1618. (p7) LD f33 = [A1], 3 * SIZE /* 1 + 3: A1 ends 4 elements further on when p7 is set */
  1619. (p7) LD f35 = [A2]
  1620. nop __LINE__
  1621. }
  1622. ;;
  1623. { .mmi
  1624. adds A2 = SIZE, A1 /* A2 = A1 + 1 element: second half of the 2-element piece */
  1625. nop __LINE__
  1626. nop __LINE__
  1627. }
  1628. ;;
  1629. { .mmi
  1630. (p8) LD f36 = [A1], 2 * SIZE /* 2-element piece: f36 via A1, f37 via A2; A1 advances 2 elements */
  1631. (p8) LD f37 = [A2]
  1632. tbit.nz p9, p0 = N, 0 /* p9 <- bit 0 of N: a single trailing element is present */
  1633. }
  1634. ;;
  1635. { .mmi
  1636. (p9) LD f38 = [A1] /* last element */
  1637. nop __LINE__
  1638. nop __LINE__
  1639. }
  1640. ;;
  1641. { .mmi
  1642. (p7) ST [BO2] = f32, 1 * SIZE
  1643. (p7) ST [B2 ] = f34, 1 * SIZE
  1644. nop __LINE__
  1645. }
  1646. ;;
  1647. { .mmi
  1648. (p7) ST [BO2] = f33, 3 * SIZE /* 1 + 3: BO2 ends 4 elements past its starting value */
  1649. (p7) ST [B2 ] = f35 /* reads the old B2; the adds below rewrites it in the same group */
  1650. adds B2 = SIZE, BO3 /* B2 = BO3 + 1 element; BO3 is set outside this chunk (presumably the 2-wide cursor, TODO confirm) */
  1651. }
  1652. ;;
  1653. { .mmi
  1654. (p8) ST [BO3] = f36, 2 * SIZE /* BO3 advances 2 elements */
  1655. (p8) ST [B2 ] = f37
  1656. nop __LINE__
  1657. }
  1658. ;;
  1659. { .mmi
  1660. (p9) ST [BO4] = f38, 1 * SIZE /* BO4 is set outside this chunk; it advances 1 element */
  1661. nop __LINE__
  1662. nop __LINE__
  1663. }
  1664. ;; /* no branch here: execution continues into .L999 */
  1665. .align 32
  1666. .L999: /* Common exit: restore the state held in PR and ARLC, then return */
  1667. mov pr = PR, -1 /* mask -1: write every predicate register from PR (presumably saved on entry, outside this chunk) */
  1668. mov ar.lc = ARLC /* restore the loop-count register from ARLC (presumably saved on entry, outside this chunk) */
  1669. br.ret.sptk.many b0 /* return through branch register b0 */
  1670. EPILOGUE