You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_ncopy.S 9.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration and symbolic register names for the IA-64 copy */
  /* kernel that follows. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distances in elements; both are multiplied by SIZE where used. */
  /* PREFETCHSIZE applies to the source (A), WPREFETCHSIZE to the destination (B). */
  40. #define PREFETCHSIZE 64
  41. #define WPREFETCHSIZE 32
  /* LD / ST select the element load and store macros: LDF8 / STF8_NTA by */
  /* default, LDFD / STFD_NTA when XDOUBLE is defined. The macros themselves */
  /* are not defined in this file (presumably they come from common.h). */
  42. #ifndef XDOUBLE
  43. #define LD LDF8
  44. #define ST STF8_NTA
  45. #else
  46. #define LD LDFD
  47. #define ST STFD_NTA
  48. #endif
  /* J: number of 8-column panels still to copy (initialised to N >> 3). */
  49. #define J r15
  /* PREB / PREA: lfetch addresses running ahead in B and in A. */
  50. #define PREB r17
  51. #define PREA r18
  /* A1..A8: read pointers, one per source column of the current panel. */
  52. #define A1 r19
  53. #define A2 r20
  54. #define A3 r21
  55. #define A4 r22
  56. #define A5 r23
  57. #define A6 r24
  58. #define A7 r25
  59. #define A8 r26
  /* B1: second write pointer into the destination, used next to B. */
  /* B2 is defined but not referenced in the code visible in this file. */
  60. #define B1 r27
  61. #define B2 r28
  /* COUNT: pass counter used to re-base PREA periodically. */
  62. #define COUNT r9
  /* I: scratch for the loop count, then rows left as seen by the load stage. */
  /* II: rows left as seen by the store stage. */
  63. #define I r10
  64. #define II r11
  /* ARLC / PR: saved copies of ar.lc and of the predicate registers. */
  65. #define ARLC r29
  66. #define PR r30
  /* r32..r36 are the first five stacked input registers, so presumably the */
  /* arguments arrive in the order M, N, A, LDA, B - confirm against the caller. */
  67. #define M r32
  68. #define N r33
  69. #define A r34
  70. #define LDA r35
  71. #define B r36
  /* Entry: scale LDA, save pr and ar.lc, and compute the number of full */
  /* 8-column panels. PROLOGUE and PROFCODE are macros not defined in this */
  /* file (presumably from common.h). */
  72. PROLOGUE
  73. .prologue
  74. PROFCODE
  75. .body
  76. { .mii
  /* LDA = (LDA << BASE_SHIFT) + r0. Presumably BASE_SHIFT is log2 of the */
  /* element size, turning LDA into a byte stride - confirm in common.h. */
  77. shladd LDA = LDA, BASE_SHIFT, r0
  /* Keep a copy of all predicates; .L999 restores them. */
  78. mov PR = pr
  /* J = N >> 3: number of full 8-column panels. */
  79. shr J = N, 3
  80. }
  81. ;;
  82. { .mib
  /* p8 = (J == 0): no full panel, go straight to the 4-column remainder. */
  83. cmp.eq p8, p0 = 0, J
  /* Keep a copy of ar.lc; .L999 restores it. */
  84. mov ARLC = ar.lc
  85. (p8) br.cond.dpnt .L20
  86. }
  87. ;;
  /* .L11: prepare one panel of eight consecutive columns. */
  /* A1..A8 = A + 0*LDA .. A + 7*LDA, then A += 8*LDA for the next panel. */
  /* The pipelined loop at .L12 is primed with ar.lc = ((M + 1) >> 1) - 1, */
  /* ar.ec = 6, p16 = 1 and every other rotating predicate cleared. */
  /* NOTE(review): M == 0 would give ar.lc = -1; presumably callers never */
  /* pass M == 0 - confirm. */
  88. .align 32
  89. .L11:
  90. { .mmi
  91. mov A1 = A
  92. add A2 = A, LDA
  /* Clear the rotating predicates before priming the pipeline. */
  93. mov pr.rot = 0
  94. }
  95. { .mmi
  96. shladd A3 = LDA, 1, A
  97. shladd A5 = LDA, 2, A
  /* I = M + 1; halved below to get the number of row pairs, rounded up. */
  98. adds I = 1, M
  99. }
  100. ;;
  101. { .mmi
  /* A4 = A2 + 2*LDA and A6 = A2 + 4*LDA. */
  102. shladd A4 = LDA, 1, A2
  103. shladd A6 = LDA, 2, A2
  /* Epilogue count 6 matches the six stages p16..p21 used by .L12. */
  104. mov ar.ec = 6
  105. }
  106. { .mmi
  /* p16 = 1: the first pass executes the load stage. */
  107. cmp.eq p16, p0 = r0, r0
  /* A7 = A3 + 4*LDA. */
  108. shladd A7 = LDA, 2, A3
  109. shr I = I, 1
  110. }
  111. ;;
  112. { .mmi
  /* B1 = B + 8 * SIZE: second write stream, eight elements after B. */
  113. adds B1 = 8 * SIZE, B
  /* A8 = A4 + 4*LDA. */
  114. shladd A8 = LDA, 2, A4
  /* A += 8*LDA: start of the next panel. */
  115. shladd A = LDA, 3, A
  116. }
  117. { .mmi
  118. adds I = -1, I
  119. mov COUNT = 0
  /* One panel accounted for. */
  120. adds J = -1, J
  121. }
  122. ;;
  123. { .mmi
  /* NOTE(review): A was advanced in the previous instruction group, so PREA */
  /* starts relative to the next panel until .L12 re-bases it from A1. */
  124. adds PREA = PREFETCHSIZE * SIZE, A
  125. adds PREB = WPREFETCHSIZE * SIZE, B
  126. mov ar.lc = I
  127. }
  128. { .mmi
  /* From here I and II count rows left for the load and the store stage. */
  129. mov I = M
  130. mov II = M
  /* p14 = 0: no second-row stores until the loop computes p14 itself. */
  131. cmp.ne p14, p0 = r0, r0
  132. }
  133. ;;
  /* .L12: modulo-scheduled loop closed by br.ctop; one pass per pair of rows. */
  /* Load stage (p16): one row of the eight columns into f32, f38, f56, f62, */
  /* f80, f86, f104, f110; when p13 is set, the following row into f44, f50, */
  /* f68, f74, f92, f98, f116, f122. */
  /* Store stage (p21) runs five rotations later, so each value is read from */
  /* the register numbered five higher (f32 -> f37, f44 -> f49, and so on). */
  /* First-row values are written through B, second-row values through B1 */
  /* under p14. */
  134. .align 32
  135. .L12:
  136. { .mmi
  137. (p21) ST [B ] = f37, 1 * SIZE
  138. (p14) ST [B1] = f49, 1 * SIZE
  /* p13 = (I != 1): a second row exists in this pair; cleared when p16 is 0. */
  139. (p16) cmp.ne.unc p13, p0 = 1, I
  140. }
  141. { .mmi
  /* Prefetch from A, stepping PREA by LDA, and from B with exclusive hint. */
  142. lfetch.nt1 [PREA], LDA
  143. lfetch.excl.nt1 [PREB]
  /* PREB advances by the 16 elements written per full pass. */
  144. adds PREB = 16 * SIZE, PREB
  145. }
  146. ;;
  147. { .mmi
  148. (p21) ST [B ] = f43, 1 * SIZE
  149. (p14) ST [B1] = f55, 1 * SIZE
  /* p9 = (COUNT == 8): time to re-base the A prefetch pointer. */
  150. cmp.eq p9, p0 = 8, COUNT
  151. }
  152. { .mmi
  153. (p16) LD f32 = [A1], SIZE
  154. (p16) LD f38 = [A2], SIZE
  /* Load stage consumed up to two rows. */
  155. (p16) adds I = -2, I
  156. }
  157. ;;
  158. { .mmi
  159. (p21) ST [B ] = f61, 1 * SIZE
  160. (p14) ST [B1] = f73, 1 * SIZE
  161. (p9) mov COUNT = 0
  162. }
  163. { .mmi
  164. (p13) LD f44 = [A1], SIZE
  165. (p13) LD f50 = [A2], SIZE
  /* Store stage consumed up to two rows. */
  166. (p21) adds II = -2, II
  167. }
  168. ;;
  169. { .mmb
  170. (p21) ST [B ] = f67, 1 * SIZE
  171. (p14) ST [B1] = f79, 1 * SIZE
  172. nop __LINE__
  173. }
  174. { .mmb
  175. (p16) LD f56 = [A3], SIZE
  176. (p16) LD f62 = [A4], SIZE
  177. nop __LINE__
  178. }
  179. ;;
  180. { .mmi
  181. (p21) ST [B ] = f85, 1 * SIZE
  182. (p14) ST [B1] = f97, 1 * SIZE
  /* Re-base PREA from the current A1 position when COUNT reached 8. */
  183. (p9) adds PREA = (PREFETCHSIZE - 2)* SIZE, A1
  184. }
  185. { .mmb
  186. (p13) LD f68 = [A3], SIZE
  187. (p13) LD f74 = [A4], SIZE
  188. nop __LINE__
  189. }
  190. ;;
  191. { .mmb
  192. (p21) ST [B ] = f91, 1 * SIZE
  193. (p14) ST [B1] = f103, 1 * SIZE
  194. nop __LINE__
  195. }
  196. { .mmb
  197. (p16) LD f80 = [A5], SIZE
  198. (p16) LD f86 = [A6], SIZE
  199. nop __LINE__
  200. }
  201. ;;
  202. { .mmb
  203. (p21) ST [B ] = f109, 1 * SIZE
  204. (p14) ST [B1] = f121, 1 * SIZE
  205. nop __LINE__
  206. }
  207. { .mmb
  208. (p13) LD f92 = [A5], SIZE
  209. (p13) LD f98 = [A6], SIZE
  210. nop __LINE__
  211. }
  212. ;;
  213. { .mmi
  214. (p21) ST [B ] = f115, 1 * SIZE
  /* Last B1 store steps by 9 so B1 moves 7 + 9 = 16 elements per pass. */
  215. (p14) ST [B1] = f127, 9 * SIZE
  216. (p16) adds COUNT = 1, COUNT
  217. }
  218. { .mmb
  219. (p16) LD f104 = [A7], SIZE
  220. (p16) LD f110 = [A8], SIZE
  221. nop __LINE__
  222. }
  223. ;;
  224. { .mmi
  225. (p13) LD f116 = [A7], SIZE
  226. (p13) LD f122 = [A8], SIZE
  /* Skip over the eight elements just written through B1. */
  227. (p14) adds B = 8 * SIZE, B
  228. }
  229. { .mmb
  /* p14 = (II != 1) for the pair that reaches the store stage next pass; */
  /* cleared when p20 is 0. */
  230. (p20) cmp.ne.unc p14, p0 = 1, II
  231. nop __LINE__
  232. br.ctop.sptk.few .L12
  233. }
  234. ;;
  235. { .mmb
  /* More 8-column panels left: start the next one. */
  236. cmp.ne p6, p0 = 0, J
  237. nop __LINE__
  238. (p6) br.cond.dptk .L11
  239. }
  240. ;;
  /* .L20: remainder panel of four columns, taken when bit 2 of N is set. */
  /* Pointer and pipeline setup is started before the test, and the branch to */
  /* .L30 skips the rest when the bit is clear. */
  241. .align 32
  242. .L20:
  243. { .mmi
  /* I = M + 1; halved below to get the number of row pairs, rounded up. */
  244. adds I = 1, M
  245. mov A1 = A
  /* Clear the rotating predicates before priming the pipeline. */
  246. mov pr.rot = 0
  247. }
  248. { .mmi
  249. add A2 = A, LDA
  250. shladd A3 = LDA, 1, A
  /* p6 = (bit 2 of N is zero): no 4-column panel. */
  251. tbit.z p6, p0 = N, 2
  252. }
  253. ;;
  254. { .mmi
  /* A4 = A2 + 2*LDA. */
  255. shladd A4 = LDA, 1, A2
  /* B1 = B + 4 * SIZE: second write stream, four elements after B. */
  256. adds B1 = 4 * SIZE, B
  /* Epilogue count 6 matches the six stages p16..p21 used by .L22. */
  257. mov ar.ec = 6
  258. }
  259. { .mib
  /* p16 = 1: the first pass executes the load stage. */
  260. cmp.eq p16, p0 = r0, r0
  261. shr I = I, 1
  262. (p6) br.cond.dpnt .L30
  263. }
  264. ;;
  265. { .mmi
  /* A += 4*LDA: first column after this panel. */
  266. shladd A = LDA, 2, A
  267. nop __LINE__
  268. nop __LINE__
  269. }
  270. { .mmi
  271. adds I = -1, I
  272. mov COUNT = 0
  /* NOTE(review): J is not read again after this point in the code visible here. */
  273. adds J = -1, J
  274. }
  275. ;;
  276. { .mmi
  /* NOTE(review): A was advanced in the previous instruction group, so PREA */
  /* starts relative to the columns after this panel until .L22 re-bases it. */
  277. adds PREA = PREFETCHSIZE * SIZE, A
  278. adds PREB = WPREFETCHSIZE * SIZE, B
  /* ar.lc = ((M + 1) >> 1) - 1. */
  279. mov ar.lc = I
  280. }
  281. { .mmi
  /* From here I and II count rows left for the load and the store stage. */
  282. mov I = M
  283. mov II = M
  /* p14 = 0: no second-row stores until the loop computes p14 itself. */
  284. cmp.ne p14, p0 = r0, r0
  285. }
  286. ;;
  /* .L22: modulo-scheduled loop closed by br.ctop; one pass per pair of rows. */
  /* Load stage (p16): one row of the four columns into f32, f38, f56, f62; */
  /* when p13 is set, the following row into f44, f50, f68, f74. */
  /* Store stage (p21) runs five rotations later and reads each value from */
  /* the register numbered five higher. First-row values go through B, */
  /* second-row values through B1 under p14. */
  287. .align 32
  288. .L22:
  289. { .mmi
  290. (p21) ST [B ] = f37, 1 * SIZE
  291. (p14) ST [B1] = f49, 1 * SIZE
  /* p13 = (I != 1): a second row exists in this pair; cleared when p16 is 0. */
  292. (p16) cmp.ne.unc p13, p0 = 1, I
  293. }
  294. { .mmi
  /* Prefetch from A stepping by LDA, and from B stepping by 8 elements. */
  295. lfetch.nt1 [PREA], LDA
  296. lfetch.excl.nt1 [PREB], 8 * SIZE
  /* p9 = (COUNT == 4): time to re-base the A prefetch pointer. */
  297. cmp.eq p9, p0 = 4, COUNT
  298. }
  299. ;;
  300. { .mmi
  301. (p21) ST [B ] = f43, 1 * SIZE
  302. (p14) ST [B1] = f55, 1 * SIZE
  /* Load stage consumed up to two rows. */
  303. (p16) adds I = -2, I
  304. }
  305. { .mmi
  306. (p16) LD f32 = [A1], SIZE
  307. (p16) LD f38 = [A2], SIZE
  /* Store stage consumed up to two rows. */
  308. (p21) adds II = -2, II
  309. }
  310. ;;
  311. { .mmi
  312. (p21) ST [B ] = f61, 1 * SIZE
  313. (p14) ST [B1] = f73, 1 * SIZE
  314. (p9) mov COUNT = 0
  315. }
  316. { .mmi
  317. (p13) LD f44 = [A1], SIZE
  318. (p13) LD f50 = [A2], SIZE
  319. nop __LINE__
  320. }
  321. ;;
  322. { .mmi
  323. (p21) ST [B ] = f67, 1 * SIZE
  /* Last B1 store steps by 5 so B1 moves 3 + 5 = 8 elements per pass. */
  324. (p14) ST [B1] = f79, 5 * SIZE
  /* Re-base PREA from the current A1 position when COUNT reached 4. */
  325. (p9) adds PREA = PREFETCHSIZE * SIZE, A1
  326. }
  327. { .mmb
  328. (p16) LD f56 = [A3], SIZE
  329. (p16) LD f62 = [A4], SIZE
  330. nop __LINE__
  331. }
  332. ;;
  333. { .mmi
  334. (p13) LD f68 = [A3], SIZE
  335. (p13) LD f74 = [A4], SIZE
  336. (p16) adds COUNT = 1, COUNT
  337. }
  338. { .mmb
  /* Skip over the four elements just written through B1. */
  339. (p14) adds B = 4 * SIZE, B
  /* p14 = (II != 1) for the pair that reaches the store stage next pass; */
  /* cleared when p20 is 0. */
  340. (p20) cmp.ne.unc p14, p0 = 1, II
  341. br.ctop.sptk.few .L22
  342. }
  343. ;;
  /* .L30: remainder panel of two columns, taken when bit 1 of N is set. */
  /* Setup is started before the test, and the branch to .L40 skips the rest */
  /* when the bit is clear. */
  344. .align 32
  345. .L30:
  346. { .mmi
  /* I = M + 1; halved below to get the number of row pairs, rounded up. */
  347. adds I = 1, M
  348. mov A1 = A
  /* Clear the rotating predicates before priming the pipeline. */
  349. mov pr.rot = 0
  350. }
  351. { .mmi
  352. add A2 = A, LDA
  /* B1 = B + 2 * SIZE: second write stream, two elements after B. */
  353. adds B1 = 2 * SIZE, B
  /* p6 = (bit 1 of N is zero): no 2-column panel. */
  354. tbit.z p6, p0 = N, 1
  355. }
  356. ;;
  357. { .mmi
  358. nop __LINE__
  359. nop __LINE__
  /* Epilogue count 6 matches the six stages p16..p21 used by .L32. */
  360. mov ar.ec = 6
  361. }
  362. { .mib
  /* p16 = 1: the first pass executes the load stage. */
  363. cmp.eq p16, p0 = r0, r0
  364. shr I = I, 1
  365. (p6) br.cond.dpnt .L40
  366. }
  367. ;;
  368. { .mmi
  369. adds I = -1, I
  /* Stop inside the bundle: ar.lc below must see the decremented I. */
  370. ;;
  /* A += 2*LDA: first column after this panel. */
  371. shladd A = LDA, 1, A
  /* ar.lc = ((M + 1) >> 1) - 1. */
  372. mov ar.lc = I
  373. }
  374. { .mmi
  /* From here I and II count rows left for the load and the store stage. */
  375. mov I = M
  376. mov II = M
  /* p14 = 0: no second-row stores until the loop computes p14 itself. */
  377. cmp.ne p14, p0 = r0, r0
  378. }
  379. ;;
  /* .L32: modulo-scheduled loop closed by br.ctop; one pass per pair of rows. */
  /* Load stage (p16): one row of the two columns into f32, f38; when p13 is */
  /* set, the following row into f44, f50. Store stage (p21) runs five */
  /* rotations later and reads f37, f43 (through B) and f49, f55 (through B1, */
  /* under p14). No lfetch is issued in this loop. */
  380. .align 32
  381. .L32:
  382. { .mmi
  383. (p21) ST [B ] = f37, 1 * SIZE
  384. (p14) ST [B1] = f49, 1 * SIZE
  /* p13 = (I != 1): a second row exists in this pair; cleared when p16 is 0. */
  385. (p16) cmp.ne.unc p13, p0 = 1, I
  386. }
  387. { .mmi
  388. nop __LINE__
  389. nop __LINE__
  /* Store stage consumed up to two rows. */
  390. (p21) adds II = -2, II
  391. }
  392. ;;
  393. { .mmi
  394. (p21) ST [B ] = f43, 1 * SIZE
  /* Second B1 store steps by 3 so B1 moves 1 + 3 = 4 elements per pass. */
  395. (p14) ST [B1] = f55, 3 * SIZE
  396. nop __LINE__
  397. }
  398. { .mmi
  399. (p16) LD f32 = [A1], SIZE
  400. (p16) LD f38 = [A2], SIZE
  401. nop __LINE__
  402. }
  403. ;;
  404. { .mmi
  405. (p13) LD f44 = [A1], SIZE
  406. (p13) LD f50 = [A2], SIZE
  /* Load stage consumed up to two rows. */
  407. (p16) adds I = -2, I
  408. }
  409. { .mmb
  /* Skip over the two elements just written through B1. */
  410. (p14) adds B = 2 * SIZE, B
  /* p14 = (II != 1) for the pair that reaches the store stage next pass; */
  /* cleared when p20 is 0. */
  411. (p20) cmp.ne.unc p14, p0 = 1, II
  412. br.ctop.sptk.few .L32
  413. }
  414. ;;
  /* .L40: final single column, taken when bit 0 of N is set. Setup is */
  /* started before the test, and the branch to .L999 skips the rest when the */
  /* bit is clear. */
  415. .align 32
  416. .L40:
  417. { .mmi
  /* I = M + 1; halved below to get the number of row pairs, rounded up. */
  418. adds I = 1, M
  419. mov A1 = A
  /* Clear the rotating predicates before priming the pipeline. */
  420. mov pr.rot = 0
  421. }
  422. { .mmi
  /* p6 = (bit 0 of N is zero): no single column left. */
  423. tbit.z p6, p0 = N, 0
  424. }
  425. ;;
  426. { .mmi
  427. nop __LINE__
  428. nop __LINE__
  /* Epilogue count 6 matches the six stages p16..p21 used by .L42. */
  429. mov ar.ec = 6
  430. }
  431. { .mib
  /* p16 = 1: the first pass executes the load stage. */
  432. cmp.eq p16, p0 = r0, r0
  433. shr I = I, 1
  434. (p6) br.cond.dpnt .L999
  435. }
  436. ;;
  437. { .mmi
  438. adds I = -1, I
  /* Stop inside the bundle: ar.lc below must see the decremented I. */
  439. ;;
  /* ar.lc = ((M + 1) >> 1) - 1. */
  440. mov ar.lc = I
  441. }
  442. { .mmi
  /* From here I and II count rows left for the load and the store stage. */
  443. mov I = M
  444. mov II = M
  /* p14 = 0: no second-element store until the loop computes p14 itself. */
  445. cmp.ne p14, p0 = r0, r0
  446. }
  447. ;;
  /* .L42: modulo-scheduled loop closed by br.ctop; up to two elements of the */
  /* column per pass. Load stage (p16) reads f32 and, when p13 is set, f44. */
  /* Store stage (p21) runs five rotations later and writes f37, then f49 */
  /* under p14, both through B. */
  448. .align 32
  449. .L42:
  450. { .mmi
  451. (p21) ST [B ] = f37, 1 * SIZE
  /* p13 = (I != 1): a second element exists in this pair; cleared when p16 is 0. */
  452. (p16) cmp.ne.unc p13, p0 = 1, I
  /* Store stage consumed up to two rows. */
  453. (p21) adds II = -2, II
  454. }
  455. ;;
  456. { .mmi
  457. (p14) ST [B ] = f49, 1 * SIZE
  458. (p16) LD f32 = [A1], SIZE
  /* Load stage consumed up to two rows. */
  459. (p16) adds I = -2, I
  460. }
  461. ;;
  462. { .mmb
  463. (p13) LD f44 = [A1], SIZE
  /* p14 = (II != 1) for the pair that reaches the store stage next pass; */
  /* cleared when p20 is 0. */
  464. (p20) cmp.ne.unc p14, p0 = 1, II
  465. br.ctop.sptk.few .L42
  466. }
  467. ;;
  /* .L999: common exit. Restore the predicate registers and ar.lc from the */
  /* copies held in PR and ARLC, then return through b0. EPILOGUE is a macro */
  /* not defined in this file. */
  468. .align 32
  469. .L999:
  /* Mask -1 restores every predicate register from PR. */
  470. mov pr = PR, -1
  471. mov ar.lc = ARLC
  472. br.ret.sptk.many b0
  473. EPILOGUE