You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_beta.S 9.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h; presumably it selects */
  /* the assembler-side definitions there (PROLOGUE, EPILOGUE, PROFCODE,   */
  /* SIZE, BASE_SHIFT, STFD, LDFD, FMPY are not defined in this file).     */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance: PRE1 is initialised to C + PREFETCHSIZE * SIZE. */
  40. #define PREFETCHSIZE 140
  /* CO1/CO2/CO3: per-column working pointers. In the zero-fill path they  */
  /* are store addresses; in the scaling path they are load addresses.     */
  /* CO1 starts at C, CO2 at C + 4 * SIZE, CO3 at C (used for the 2- and   */
  /* 1-element remainders). CO1/CO2 also serve as scratch addresses on     */
  /* entry while C and LDC are being fetched.                              */
  41. #define CO1 r14
  42. #define CO2 r15
  43. #define CO3 r16
  /* DO1/DO2/DO3: store addresses of the scaling path, initialised to the  */
  /* same positions as CO1/CO2/CO3.                                        */
  44. #define DO1 r17
  45. #define DO2 r18
  46. #define DO3 r19
  /* I: (M >> 4) - 1, copied into ar.lc for the 16-element main loops. */
  47. #define I r22
  /* I_AND_15: M & 15, the number of elements left after the main loop. */
  48. #define I_AND_15 r23
  /* PRE1: address handed to lfetch; advanced by 16 * SIZE per iteration. */
  49. #define PRE1 r24
  /* PR: copy of the predicate registers taken on entry. */
  50. #define PR r30
  /* ARLC: copy of ar.lc taken on entry. */
  51. #define ARLC r31
  /* M, N: read but never written in this file (incoming values). M sets   */
  /* the element count handled per column, N the number of columns.        */
  52. #define M r32
  53. #define N r33
  /* C: current column pointer, advanced by LDC once per column. */
  54. #define C r34
  /* LDC: column stride, shifted left by BASE_SHIFT before use. */
  55. #define LDC r35
  /* J: columns still to process, initialised from N. */
  56. #define J r36
  /* BETA: the scalar; compared against f0 and used as the multiplier. */
  57. #define BETA f8
  /*
   * Entry sequence. Fetches C and LDC from memory, saves ar.lc and the
   * predicate registers, returns at once when N <= 0, and then dispatches:
   * BETA != 0 goes to the scaling path at .L100, otherwise execution falls
   * through to the zero-fill path at .L60.
   * PROLOGUE and PROFCODE are macros that are not defined in this file
   * (presumably supplied by common.h).
   */
  58. PROLOGUE
  59. .prologue
  60. PROFCODE
  /* CO1/CO2 = r12 + 16 / r12 + 24 (r12 + 32 / r12 + 40 with XDOUBLE): the  */
  /* addresses C and LDC are loaded from below. NOTE(review): presumably    */
  /* stack-passed arguments; confirm against the caller's argument layout.  */
  /* ar.lc is copied to ARLC, and .save records that for the unwinder.      */
  61. { .mmi
  62. #ifndef XDOUBLE
  63. adds CO1 = 16, r12
  64. adds CO2 = 24, r12
  65. #else
  66. adds CO1 = 32, r12
  67. adds CO2 = 40, r12
  68. #endif
  69. .save ar.lc, ARLC
  70. mov ARLC = ar.lc
  71. }
  /* p6  = (0 >= N): no columns, so return before anything is modified.     */
  /* p15 = complement of (BETA == f0), i.e. true when BETA is not zero;     */
  /* it is consumed by the branch to .L100 at the end of this block.        */
  72. { .mfb
  73. cmp.ge p6, p0 = 0, N
  74. fcmp.eq p0, p15 = BETA, f0
  75. (p6) br.ret.sptk.many b0
  76. }
  77. ;;
  78. .body
  /* Load C and LDC (the post-increment of CO1 is not used afterwards) and  */
  /* take a copy of the predicate registers in PR.                          */
  79. { .mmi
  80. ld8 C = [CO1], 8
  81. ld8 LDC = [CO2]
  82. mov PR = pr
  83. }
  /* J = number of columns left; I = M >> 4 = full 16-element groups. */
  84. { .mmi
  85. mov J = N
  86. shr I = M, 4
  87. }
  88. ;;
  /* LDC = LDC << BASE_SHIFT (shladd with r0 adds zero); presumably this    */
  /* converts the stride from elements to bytes - confirm BASE_SHIFT in     */
  /* common.h. I is decremented because the ar.lc-counted loops below run   */
  /* ar.lc + 1 times; I becomes negative when M < 16.                       */
  89. { .mmb
  90. shladd LDC = LDC, BASE_SHIFT, r0
  91. adds I = -1, I
  92. (p15) br.cond.dpnt .L100 // if (beta != 0) goto L100
  93. }
  94. ;;
  /*
   * Zero-fill path (reached when BETA == 0). For each of the J columns,
   * f0 is stored to M consecutive elements starting at C, then C advances
   * by LDC. .L60 is the per-column setup, .L70 the 16-element main loop,
   * .L80 the remainder (M & 15 elements) and .L99 the column loop control.
   */
  95. .align 32
  96. .L60:
  /* Column setup: CO1 = CO3 = C, CO2 = C + 4 * SIZE. */
  97. { .mmi
  98. mov CO1 = C
  99. mov CO3 = C
  100. add CO2 = 4 * SIZE, C
  101. }
  /* PRE1 = prefetch address ahead of this column; C is advanced to the     */
  /* next column now (the working pointers already hold the old value).     */
  /* p12 = bit 3 of M (an 8-element piece exists in the remainder).         */
  102. { .mmi
  103. adds PRE1 = PREFETCHSIZE * SIZE, C
  104. add C = C, LDC
  105. tbit.nz p12, p0 = M, 3
  106. }
  107. ;;
  /* I_AND_15 = M & 15; ar.lc = I, so the main loop runs I + 1 times. */
  108. { .mmi
  109. and I_AND_15 = 15, M
  110. mov ar.lc = I
  111. }
  /* p8 = (0 > I), i.e. M < 16: skip the main loop entirely. */
  112. { .mib
  113. cmp.gt p8, p0 = 0, I
  114. (p8) br.cond.dpnt .L80
  115. }
  116. ;;
  117. .align 32
  /* Main loop: 16 zero stores per iteration. The post-increments 1,1,1,5   */
  /* make CO1 cover elements 0-3 and 8-11 of the group while CO2 covers     */
  /* 4-7 and 12-15; both end 16 elements further on. CO3 and PRE1 are       */
  /* advanced by 16 * SIZE, and one exclusive prefetch is issued.           */
  118. .L70:
  119. { .mmi
  120. STFD [CO1] = f0, 1 * SIZE
  121. STFD [CO2] = f0, 1 * SIZE
  122. }
  123. { .mmi
  124. lfetch.excl.nt1 [PRE1]
  125. nop.m 0
  126. adds PRE1 = 16 * SIZE, PRE1
  127. }
  128. ;;
  129. { .mmi
  130. STFD [CO1] = f0, 1 * SIZE
  131. STFD [CO2] = f0, 1 * SIZE
  132. adds CO3 = 16 * SIZE, CO3
  133. }
  134. ;;
  135. { .mmi
  136. STFD [CO1] = f0, 1 * SIZE
  137. STFD [CO2] = f0, 1 * SIZE
  138. }
  139. ;;
  /* Fourth store of each pointer: skip over the other pointer's four. */
  140. { .mmi
  141. STFD [CO1] = f0, 5 * SIZE
  142. STFD [CO2] = f0, 5 * SIZE
  143. }
  144. ;;
  145. { .mmi
  146. STFD [CO1] = f0, 1 * SIZE
  147. STFD [CO2] = f0, 1 * SIZE
  148. }
  149. ;;
  150. { .mmi
  151. STFD [CO1] = f0, 1 * SIZE
  152. STFD [CO2] = f0, 1 * SIZE
  153. }
  154. ;;
  155. { .mmi
  156. STFD [CO1] = f0, 1 * SIZE
  157. STFD [CO2] = f0, 1 * SIZE
  158. }
  159. ;;
  /* Counted loop branch: repeats while ar.lc is non-zero. */
  160. { .mmb
  161. STFD [CO1] = f0, 5 * SIZE
  162. STFD [CO2] = f0, 5 * SIZE
  163. br.cloop.sptk.few .L70
  164. }
  165. ;;
  166. .align 32
  /* Remainder, selected by bits 3..0 of M:                                 */
  /*   p12 (bit 3): 8 elements, four via CO1 and the next four via CO2      */
  /*   p13 (bit 2): 4 elements via CO1                                      */
  /*   p14 (bit 1): 2 elements via CO3                                      */
  /*   p15 (bit 0): 1 element  via CO3                                      */
  /* CO3 is moved past the 8- and 4-element pieces so that it addresses     */
  /* the 2/1-element tail. The first p12 stores are issued before the       */
  /* "nothing left" branch; they cannot fire when M & 15 == 0 because       */
  /* bit 3 of M, and therefore p12, is then clear.                          */
  167. .L80:
  168. { .mmi
  169. (p12) STFD [CO1] = f0, 1 * SIZE
  170. (p12) STFD [CO2] = f0, 1 * SIZE
  171. tbit.nz p13, p0 = M, 2
  172. }
  /* One column done: J -= 1. p9 = (M & 15 == 0) skips the remainder. */
  173. { .mmb
  174. cmp.eq p9, p0 = 0, I_AND_15
  175. adds J = -1, J
  176. (p9) br.cond.dptk .L99
  177. }
  178. ;;
  179. { .mmi
  180. (p12) STFD [CO1] = f0, 1 * SIZE
  181. (p12) STFD [CO2] = f0, 1 * SIZE
  182. tbit.nz p14, p0 = M, 1
  183. }
  184. ;;
  185. { .mmi
  186. (p12) STFD [CO1] = f0, 1 * SIZE
  187. (p12) STFD [CO2] = f0, 1 * SIZE
  188. (p12) adds CO3 = 8 * SIZE, CO3
  189. }
  190. ;;
  /* Last p12 stores: CO1 jumps past CO2's four elements; CO2 is not used   */
  /* again in this column, so its store has no post-increment.              */
  191. { .mmi
  192. (p12) STFD [CO1] = f0, 5 * SIZE
  193. (p12) STFD [CO2] = f0
  194. (p13) adds CO3 = 4 * SIZE, CO3
  195. }
  196. ;;
  197. { .mmi
  198. (p13) STFD [CO1] = f0, 1 * SIZE
  199. (p14) STFD [CO3] = f0, 1 * SIZE
  200. }
  201. ;;
  202. { .mmi
  203. (p13) STFD [CO1] = f0, 1 * SIZE
  204. (p14) STFD [CO3] = f0, 1 * SIZE
  205. tbit.nz p15, p0 = M, 0
  206. }
  207. ;;
  208. { .mmi
  209. (p13) STFD [CO1] = f0, 1 * SIZE
  210. (p15) STFD [CO3] = f0
  211. }
  212. ;;
  213. { .mmi
  214. (p13) STFD [CO1] = f0
  215. }
  216. ;;
  217. .align 32
  /* Column loop control: restore ar.lc, go back to .L60 while J > 0,       */
  /* otherwise return. PR is not written back to pr on this path.           */
  /* NOTE(review): between entry and this return only p6, p8, p9 and        */
  /* p12-p15 are written; presumably these are caller-clobberable under     */
  /* the ABI in use - confirm before adding other predicates here.          */
  218. .L99:
  219. { .mib
  220. cmp.lt p6, p0 = 0, J
  221. mov ar.lc = ARLC
  222. }
  223. { .mbb
  224. (p6) br.cond.dptk .L60
  225. br.ret.sptk.many b0
  226. }
  227. ;;
  /*
   * Scaling path (reached when BETA != 0). For each of the J columns, M
   * consecutive elements starting at C are loaded, multiplied by BETA and
   * stored back to the same positions; then C advances by LDC.
   * .L100 is the per-column setup, .L170 the software-pipelined 16-element
   * main loop, .L180 the remainder (M & 15 elements) and .L199 the column
   * loop control and function exit.
   */
  228. .align 32
  229. .L100:
  /* Load pointers CO1 = CO3 = C; clear the rotating predicates so that no  */
  /* pipeline stage is active before the loop starts.                       */
  230. { .mmi
  231. mov CO1 = C
  232. mov CO3 = C
  233. mov pr.rot = 0
  234. }
  /* PRE1 = prefetch address, CO2 = C + 4 * SIZE, store pointer DO1 = C. */
  235. { .mmi
  236. adds PRE1 = PREFETCHSIZE * SIZE, C
  237. add CO2 = 4 * SIZE, C
  238. mov DO1 = C
  239. }
  240. ;;
  /* Epilogue count 6: the loop below uses stage predicates p16..p21. */
  241. { .mmi
  242. mov ar.ec = 6
  243. }
  /* Store pointers DO2 = C + 4 * SIZE, DO3 = C; then C moves on to the     */
  /* next column (all working pointers already hold the old value).         */
  244. { .mmi
  245. adds DO2 = 4 * SIZE, C
  246. mov DO3 = C
  247. add C = C, LDC
  248. }
  249. ;;
  /* I_AND_15 = M & 15; p16 = 1 (r0 == r0) enables the first pipeline       */
  /* stage; ar.lc = I.                                                      */
  250. { .mmi
  251. and I_AND_15 = 15, M
  252. cmp.eq p16, p0 = r0, r0
  253. mov ar.lc = I
  254. }
  /* p8 = (0 > I), i.e. M < 16: skip the main loop. p12 = bit 3 of M. */
  255. { .mib
  256. cmp.gt p8, p0 = 0, I
  257. tbit.nz p12, p0 = M, 3
  258. (p8) br.cond.dpnt .L180
  259. }
  260. ;;
  261. .align 32
  /*
   * Software-pipelined main loop (br.ctop). One 16-element group passes
   * through three active stages:
   *   p16: load the 16 elements into rotating registers f32, f38, ...,
   *        f122 (step 6). CO1 supplies elements 0-3 and 8-11, CO2
   *        supplies 4-7 and 12-15 (post-increments 1,1,1,5). Also issues
   *        the prefetch and advances CO3 and DO3 by 16 * SIZE.
   *   p20: multiply elements 0-7 by BETA. After four rotations the values
   *        loaded as f32, f38, ... are named f36, f42, ..., f78. The
   *        products go to f6, f7, f10-f15 (second half of the body).
   *   p21: store those eight products through DO1/DO2, multiply elements
   *        8-15 (loaded as f80..f122, named f85..f127 after five
   *        rotations) into the same f6, f7, f10-f15, and store them too.
   * NOTE(review): every store shares a bundle with the FMPY that
   * overwrites its source register, so this presumably relies on the store
   * reading the value from before that bundle. Do not reorder or split
   * these bundles without checking the architecture's dependency rules.
   */
  262. .L170:
  263. { .mmf
  264. (p21) STFD [DO1] = f6, 1 * SIZE
  265. (p21) STFD [DO2] = f7, 1 * SIZE
  266. (p21) FMPY f6 = BETA, f85
  267. }
  268. { .mmf
  269. (p16) lfetch.excl.nt1 [PRE1]
  270. (p16) adds CO3 = 16 * SIZE, CO3
  271. (p21) FMPY f7 = BETA, f91
  272. }
  273. ;;
  274. { .mmf
  275. (p21) STFD [DO1] = f10, 1 * SIZE
  276. (p21) STFD [DO2] = f11, 1 * SIZE
  277. (p21) FMPY f10 = BETA, f97
  278. }
  279. { .mmf
  280. (p16) LDFD f32 = [CO1], 1 * SIZE
  281. (p16) LDFD f38 = [CO2], 1 * SIZE
  282. (p21) FMPY f11 = BETA, f103
  283. }
  284. ;;
  285. { .mmf
  286. (p21) STFD [DO1] = f12, 1 * SIZE
  287. (p21) STFD [DO2] = f13, 1 * SIZE
  288. (p21) FMPY f12 = BETA, f109
  289. }
  290. { .mmf
  291. (p16) LDFD f44 = [CO1], 1 * SIZE
  292. (p16) LDFD f50 = [CO2], 1 * SIZE
  293. (p21) FMPY f13 = BETA, f115
  294. }
  295. ;;
  296. { .mmf
  297. (p21) STFD [DO1] = f14, 5 * SIZE
  298. (p21) STFD [DO2] = f15, 5 * SIZE
  299. (p21) FMPY f14 = BETA, f121
  300. }
  301. { .mmf
  302. (p16) LDFD f56 = [CO1], 1 * SIZE
  303. (p16) LDFD f62 = [CO2], 1 * SIZE
  304. (p21) FMPY f15 = BETA, f127
  305. }
  306. ;;
  /* Second half: p21 stores elements 8-15 of its group while p20 starts    */
  /* multiplying elements 0-7 of the following group.                       */
  307. { .mmf
  308. (p21) STFD [DO1] = f6, 1 * SIZE
  309. (p21) STFD [DO2] = f7, 1 * SIZE
  310. (p20) FMPY f6 = BETA, f36
  311. }
  312. { .mmf
  313. (p16) LDFD f68 = [CO1], 5 * SIZE
  314. (p16) LDFD f74 = [CO2], 5 * SIZE
  315. (p20) FMPY f7 = BETA, f42
  316. }
  317. ;;
  318. { .mmf
  319. (p21) STFD [DO1] = f10, 1 * SIZE
  320. (p21) STFD [DO2] = f11, 1 * SIZE
  321. (p20) FMPY f10 = BETA, f48
  322. }
  323. { .mmf
  324. (p16) LDFD f80 = [CO1], 1 * SIZE
  325. (p16) LDFD f86 = [CO2], 1 * SIZE
  326. (p20) FMPY f11 = BETA, f54
  327. }
  328. ;;
  329. { .mmf
  330. (p21) STFD [DO1] = f12, 1 * SIZE
  331. (p21) STFD [DO2] = f13, 1 * SIZE
  332. (p20) FMPY f12 = BETA, f60
  333. }
  334. { .mmf
  335. (p16) LDFD f92 = [CO1], 1 * SIZE
  336. (p16) LDFD f98 = [CO2], 1 * SIZE
  337. (p20) FMPY f13 = BETA, f66
  338. }
  339. ;;
  340. { .mmf
  341. (p21) STFD [DO1] = f14, 5 * SIZE
  342. (p21) STFD [DO2] = f15, 5 * SIZE
  343. (p20) FMPY f14 = BETA, f72
  344. }
  345. { .mmf
  346. (p16) LDFD f104 = [CO1], 1 * SIZE
  347. (p16) LDFD f110 = [CO2], 1 * SIZE
  348. (p20) FMPY f15 = BETA, f78
  349. }
  350. ;;
  /* PRE1 is advanced on every pass (it is not predicated); the lfetch      */
  /* that uses it is issued only under p16.                                 */
  351. { .mmi
  352. (p16) LDFD f116 = [CO1], 5 * SIZE
  353. (p16) LDFD f122 = [CO2], 5 * SIZE
  354. adds PRE1 = 16 * SIZE, PRE1
  355. }
  /* Pipelined loop branch: rotates the registers and stage predicates. */
  356. { .mmb
  357. (p16) adds DO3 = 16 * SIZE, DO3
  358. nop.m 0
  359. br.ctop.sptk.few .L170
  360. }
  361. ;;
  362. .align 32
  /*
   * Remainder, selected by bits 3..0 of M. Values are loaded, multiplied
   * in place by BETA, and stored through the matching DO pointer:
   *   p12 (bit 3): 8 elements, f32-f35 via CO1/DO1, f36-f39 via CO2/DO2
   *   p13 (bit 2): 4 elements, f40-f43 via CO1/DO1
   *   p14 (bit 1): 2 elements, f44-f45 via CO3/DO3
   *   p15 (bit 0): 1 element,  f46     via CO3/DO3
   * CO3 and DO3 are moved past the 8- and 4-element pieces so that they
   * address the 2/1-element tail. The first p12 loads are issued before
   * the "nothing left" branch; they cannot fire when M & 15 == 0 because
   * bit 3 of M, and therefore p12, is then clear.
   */
  363. .L180:
  364. { .mmi
  365. (p12) LDFD f32 = [CO1], 1 * SIZE
  366. (p12) LDFD f36 = [CO2], 1 * SIZE
  367. tbit.nz p13, p0 = M, 2
  368. }
  /* One column done: J -= 1. p9 = (M & 15 == 0) skips the remainder. */
  369. { .mmb
  370. cmp.eq p9, p0 = 0, I_AND_15
  371. adds J = -1, J
  372. (p9) br.cond.dptk .L199
  373. }
  374. ;;
  375. { .mmi
  376. (p12) LDFD f33 = [CO1], 1 * SIZE
  377. (p12) LDFD f37 = [CO2], 1 * SIZE
  378. tbit.nz p14, p0 = M, 1
  379. }
  380. ;;
  381. { .mmi
  382. (p12) LDFD f34 = [CO1], 1 * SIZE
  383. (p12) LDFD f38 = [CO2], 1 * SIZE
  384. (p12) adds CO3 = 8 * SIZE, CO3
  385. }
  386. ;;
  /* Last p12 loads: CO1 jumps past CO2's four elements; CO2 is not used    */
  /* again in this column, so its load has no post-increment.               */
  387. { .mmi
  388. (p12) LDFD f35 = [CO1], 5 * SIZE
  389. (p12) LDFD f39 = [CO2]
  390. (p13) adds CO3 = 4 * SIZE, CO3
  391. }
  392. ;;
  393. { .mmi
  394. (p13) LDFD f40 = [CO1], 1 * SIZE
  395. (p14) LDFD f44 = [CO3], 1 * SIZE
  396. }
  397. ;;
  398. { .mmi
  399. (p13) LDFD f41 = [CO1], 1 * SIZE
  400. (p14) LDFD f45 = [CO3], 1 * SIZE
  401. tbit.nz p15, p0 = M, 0
  402. }
  403. ;;
  /* The remaining loads overlap with the first multiplies. */
  404. { .mmf
  405. (p13) LDFD f42 = [CO1], 1 * SIZE
  406. (p15) LDFD f46 = [CO3]
  407. (p12) FMPY f32 = BETA, f32
  408. }
  409. { .mmf
  410. (p12) FMPY f36 = BETA, f36
  411. }
  412. ;;
  413. { .mmf
  414. (p13) LDFD f43 = [CO1]
  415. (p12) FMPY f33 = BETA, f33
  416. }
  417. { .mmf
  418. (p12) FMPY f37 = BETA, f37
  419. }
  420. ;;
  /* These four multiplies are written without explicit bundle braces. */
  421. (p12) FMPY f34 = BETA, f34
  422. (p12) FMPY f38 = BETA, f38
  423. (p12) FMPY f35 = BETA, f35
  424. (p12) FMPY f39 = BETA, f39
  425. ;;
  /* Stores of the 8-element piece overlap with the multiplies of the       */
  /* 4-, 2- and 1-element pieces; DO3 is advanced the same way as CO3.      */
  426. { .mmf
  427. (p12) STFD [DO1] = f32, 1 * SIZE
  428. (p12) STFD [DO2] = f36, 1 * SIZE
  429. (p13) FMPY f40 = BETA, f40
  430. }
  431. { .mmf
  432. (p12) adds DO3 = 8 * SIZE, DO3
  433. (p14) FMPY f44 = BETA, f44
  434. }
  435. ;;
  436. { .mmf
  437. (p12) STFD [DO1] = f33, 1 * SIZE
  438. (p12) STFD [DO2] = f37, 1 * SIZE
  439. (p13) FMPY f41 = BETA, f41
  440. }
  441. { .mmf
  442. (p13) adds DO3 = 4 * SIZE, DO3
  443. (p14) FMPY f45 = BETA, f45
  444. }
  445. ;;
  446. { .mmf
  447. (p12) STFD [DO1] = f34, 1 * SIZE
  448. (p12) STFD [DO2] = f38, 1 * SIZE
  449. (p13) FMPY f42 = BETA, f42
  450. }
  451. { .mmf
  452. (p15) FMPY f46 = BETA, f46
  453. }
  454. ;;
  /* Last p12 stores: DO1 jumps past DO2's four elements; DO2 is done. */
  455. { .mmf
  456. (p12) STFD [DO1] = f35, 5 * SIZE
  457. (p12) STFD [DO2] = f39
  458. (p13) FMPY f43 = BETA, f43
  459. }
  460. ;;
  461. { .mmi
  462. (p13) STFD [DO1] = f40, 1 * SIZE
  463. (p14) STFD [DO3] = f44, 1 * SIZE
  464. }
  465. ;;
  466. { .mmi
  467. (p13) STFD [DO1] = f41, 1 * SIZE
  468. (p14) STFD [DO3] = f45, 1 * SIZE
  469. }
  470. ;;
  471. { .mmi
  472. (p13) STFD [DO1] = f42, 1 * SIZE
  473. (p15) STFD [DO3] = f46
  474. }
  475. ;;
  476. { .mmi
  477. (p13) STFD [DO1] = f43
  478. }
  479. ;;
  480. .align 32
  /* Column loop control: restore ar.lc and go back to .L100 while J > 0    */
  /* (.L100 sets ar.lc, ar.ec and the rotating predicates again).           */
  481. .L199:
  482. { .mib
  483. cmp.lt p6, p0 = 0, J
  484. mov ar.lc = ARLC
  485. (p6) br.cond.dptk .L100
  486. }
  487. ;;
  /* Exit: write the entry copy PR back to all predicate registers (mask    */
  /* -1), which undoes the changes to the rotating predicates, then return. */
  488. { .mib
  489. mov pr = PR, -1
  490. br.ret.sptk.many b0
  491. }
  492. ;;
  /* EPILOGUE is a macro not defined in this file (presumably from          */
  /* common.h, closing what PROLOGUE opened).                               */
  493. EPILOGUE