You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_beta.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register aliases and constants for the complex beta-scaling kernel below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance, in units of SIZE, added to the start-of-column pointer. */
  40. #define PREFETCHSIZE 74
  /* CO1..CO3: read pointers into C (also the store pointers on the beta == 0 path). */
  41. #define CO1 r14
  42. #define CO2 r15
  43. #define CO3 r16
  /* DO1..DO3: store pointers into C, used only on the general (beta != 0) path. */
  44. #define DO1 r17
  45. #define DO2 r18
  46. #define DO3 r19
  /* I: (M >> 3) - 1, copied into ar.lc as the inner-loop count. */
  47. #define I r22
  /* I_AND_15: M & 15, tested against zero to skip the remainder code. */
  48. #define I_AND_15 r23
  /* PRE1: address operand of the lfetch prefetches. */
  49. #define PRE1 r24
  /* PR / ARLC: saved copies of the predicate registers and of ar.lc. */
  50. #define PR r30
  51. #define ARLC r31
  /* M, N: row and column counts; both are checked for <= 0 at entry. */
  52. #define M r32
  53. #define N r33
  /* C, LDC: matrix pointer and stride, loaded from memory at r12+24 / r12+32. */
  54. #define C r34
  55. #define LDC r35
  /* J: outer-loop counter, initialised to N and decremented once per column. */
  56. #define J r36
  /* Real and imaginary parts of the scaling factor. */
  57. #define BETA_R f8
  58. #define BETA_I f9
  /*
   * Entry point (the symbol name is supplied by the PROLOGUE macro).
   *
   * For each of N outer iterations (C advances by LDC each time; presumably
   * one matrix column), the M complex elements starting at C are either
   * overwritten with zero, when BETA_R and BETA_I both compare equal to f0,
   * or multiplied in place by the complex scalar BETA_R + i*BETA_I.
   *
   * Inputs visible here: M = r32, N = r33, BETA_R = f8, BETA_I = f9;
   * C and LDC are loaded from memory at r12 + 24 and r12 + 32.
   * Returns immediately when N <= 0 or M <= 0.
   */
  59. PROLOGUE
  60. .prologue
  61. PROFCODE
  /* Form the addresses of the two in-memory arguments; keep a copy of ar.lc. */
  62. { .mmi
  63. adds CO1 = 24, r12
  64. adds CO2 = 32, r12
  65. .save ar.lc, ARLC
  66. mov ARLC = ar.lc
  67. }
  /* Return if N <= 0.  p14 receives the complement of (BETA_R == f0). */
  68. { .mfb
  69. cmp.ge p6, p0 = 0, N
  70. fcmp.eq p0, p14 = BETA_R, f0
  71. (p6) br.ret.sptk.many b0
  72. }
  73. ;;
  74. .body
  /* Load C and LDC, and save the predicate registers in PR. */
  75. { .mmi
  76. ld8 C = [CO1], 8
  77. ld8 LDC = [CO2]
  78. mov PR = pr
  79. }
  /* J = N (outer counter); p15 = complement of (BETA_I == f0); I = M >> 3. */
  80. { .mfi
  81. mov J = N
  82. fcmp.eq p0, p15 = BETA_I, f0
  83. shr I = M, 3
  84. }
  85. ;;
  /* Return if M <= 0.  I becomes (M >> 3) - 1, the value later put in ar.lc. */
  86. { .mmb
  87. cmp.ge p6, p0 = 0, M
  88. adds I = -1, I
  89. (p6) br.ret.sptk.many b0
  90. }
  91. ;;
  /*
   * LDC = LDC << ZBASE_SHIFT (presumably converts the stride to bytes;
   * ZBASE_SHIFT comes from common.h).  If either part of beta is non-zero,
   * take the scaling path at .L100; otherwise fall through to the zero-fill
   * path at .L60.
   */
  92. { .mbb
  93. shladd LDC = LDC, ZBASE_SHIFT, r0
  94. (p14) br.cond.dpnt .L100
  95. (p15) br.cond.dpnt .L100
  96. }
  97. ;;
  /*
   * beta == 0 path: store f0 over M complex elements, once per outer iteration.
   *
   * .L60  per-column setup
   * .L70  main loop, 16 SIZE-sized stores (8 complex values) per iteration
   * .L80  remainder, selected by bits 2, 1 and 0 of M
   * .L99  outer-loop test and return
   */
  98. .align 32
  99. .L60:
  /* CO1 and CO3 start at C, CO2 four elements further on. */
  100. { .mmi
  101. mov CO1 = C
  102. mov CO3 = C
  103. add CO2 = 4 * SIZE, C
  104. }
  /* Set the prefetch pointer, advance C by LDC, p12 = bit 2 of M. */
  105. { .mmi
  106. adds PRE1 = PREFETCHSIZE * SIZE, C
  107. add C = C, LDC
  108. tbit.nz p12, p0 = M, 2
  109. }
  110. ;;
  111. { .mmi
  112. and I_AND_15 = 15, M
  113. mov ar.lc = I
  114. }
  /* I < 0 means M < 8: skip the main loop entirely. */
  115. { .mib
  116. cmp.gt p8, p0 = 0, I
  117. (p8) br.cond.dpnt .L80
  118. }
  119. ;;
  120. .align 32
  /*
   * Main zero-fill loop.  CO1 writes element offsets 0-3 and 8-11, CO2 writes
   * 4-7 and 12-15; the "5 * SIZE" post-increments step each pointer over the
   * four elements handled by the other one.  CO3 is only advanced here so that
   * it points at the remainder afterwards.
   */
  121. .L70:
  122. { .mmi
  123. STFD [CO1] = f0, 1 * SIZE
  124. STFD [CO2] = f0, 1 * SIZE
  125. }
  126. { .mmi
  127. lfetch.excl.nt1 [PRE1], 16 * SIZE
  128. nop.m 0
  129. }
  130. ;;
  131. { .mmi
  132. STFD [CO1] = f0, 1 * SIZE
  133. STFD [CO2] = f0, 1 * SIZE
  134. adds CO3 = 16 * SIZE, CO3
  135. }
  136. ;;
  137. { .mmi
  138. STFD [CO1] = f0, 1 * SIZE
  139. STFD [CO2] = f0, 1 * SIZE
  140. }
  141. ;;
  142. { .mmi
  143. STFD [CO1] = f0, 5 * SIZE
  144. STFD [CO2] = f0, 5 * SIZE
  145. }
  146. ;;
  147. { .mmi
  148. STFD [CO1] = f0, 1 * SIZE
  149. STFD [CO2] = f0, 1 * SIZE
  150. }
  151. ;;
  152. { .mmi
  153. STFD [CO1] = f0, 1 * SIZE
  154. STFD [CO2] = f0, 1 * SIZE
  155. }
  156. ;;
  157. { .mmi
  158. STFD [CO1] = f0, 1 * SIZE
  159. STFD [CO2] = f0, 1 * SIZE
  160. }
  161. ;;
  162. { .mmb
  163. STFD [CO1] = f0, 5 * SIZE
  164. STFD [CO2] = f0, 5 * SIZE
  165. br.cloop.sptk.few .L70
  166. }
  167. ;;
  168. .align 32
  /*
   * Remainder.  p12 (bit 2 of M): 8 elements through CO1/CO2.
   *             p13 (bit 1 of M): 4 elements through CO1.
   *             p14 (bit 0 of M): 2 elements through CO3, which is first moved
   *                               past whatever p12 and p13 cover.
   */
  169. .L80:
  170. { .mmi
  171. (p12) STFD [CO1] = f0, 1 * SIZE
  172. (p12) STFD [CO2] = f0, 1 * SIZE
  173. tbit.nz p13, p0 = M, 1
  174. }
  /*
   * Count this column as done and skip the rest when (M & 15) == 0.
   * NOTE(review): the mask is 15 although the loop above consumes 8 complex
   * values per iteration and only bits 2..0 are tested below.  When
   * (M & 15) == 8 the remainder is entered with p12, p13 and p14 all clear,
   * which looks harmless; confirm the mask is intentional.
   */
  175. { .mmb
  176. cmp.eq p9, p0 = 0, I_AND_15
  177. adds J = -1, J
  178. (p9) br.cond.dptk .L99
  179. }
  180. ;;
  181. { .mmi
  182. (p12) STFD [CO1] = f0, 1 * SIZE
  183. (p12) STFD [CO2] = f0, 1 * SIZE
  184. tbit.nz p14, p0 = M, 0
  185. }
  186. ;;
  187. { .mmi
  188. (p12) STFD [CO1] = f0, 1 * SIZE
  189. (p12) STFD [CO2] = f0, 1 * SIZE
  190. (p12) adds CO3 = 8 * SIZE, CO3
  191. }
  192. ;;
  /* CO1 jumps over the four elements CO2 has just written. */
  193. { .mmi
  194. (p12) STFD [CO1] = f0, 5 * SIZE
  195. (p12) STFD [CO2] = f0
  196. (p13) adds CO3 = 4 * SIZE, CO3
  197. }
  198. ;;
  199. { .mmi
  200. (p13) STFD [CO1] = f0, 1 * SIZE
  201. (p14) STFD [CO3] = f0, 1 * SIZE
  202. }
  203. ;;
  204. { .mmi
  205. (p13) STFD [CO1] = f0, 1 * SIZE
  206. (p14) STFD [CO3] = f0, 1 * SIZE
  207. }
  208. ;;
  209. { .mmi
  210. (p13) STFD [CO1] = f0, 1 * SIZE
  211. }
  212. ;;
  213. { .mmi
  214. (p13) STFD [CO1] = f0
  215. }
  216. ;;
  217. .align 32
  /*
   * Restore ar.lc, then loop back while J > 0; otherwise return.
   * NOTE(review): this return does not restore pr from PR.  The path from
   * entry to here writes only p6, p8, p9, p12, p13, p14 and p15; presumably
   * acceptable if those are scratch predicates under the ABI -- confirm.
   */
  218. .L99:
  219. { .mib
  220. cmp.lt p6, p0 = 0, J
  221. mov ar.lc = ARLC
  222. }
  223. { .mbb
  224. (p6) br.cond.dptk .L60
  225. br.ret.sptk.many b0
  226. }
  227. ;;
  /*
   * General path (beta != 0): per-column setup for the software-pipelined
   * loop at .L170.  Load pointers CO1/CO3 and store pointers DO1/DO3 start at
   * C; CO2 and DO2 start four elements further on.
   */
  228. .align 32
  229. .L100:
  /* Clear all rotating predicates before the pipeline is primed. */
  230. { .mmi
  231. mov CO1 = C
  232. mov CO3 = C
  233. mov pr.rot = 0
  234. }
  235. { .mmi
  236. adds PRE1 = PREFETCHSIZE * SIZE, C
  237. add CO2 = 4 * SIZE, C
  238. mov DO1 = C
  239. }
  240. ;;
  /* Epilogue count 6, matching the six stage predicates p16..p21 used in .L170. */
  241. { .mmi
  242. mov ar.ec = 6
  243. }
  /* The remaining pointers are taken from C before C is advanced by LDC. */
  244. { .mmi
  245. adds DO2 = 4 * SIZE, C
  246. mov DO3 = C
  247. add C = C, LDC
  248. }
  249. ;;
  /* p16 = 1 (r0 == r0) enables the first pipeline stage; ar.lc = I. */
  250. { .mmi
  251. and I_AND_15 = 15, M
  252. cmp.eq p16, p0 = r0, r0
  253. mov ar.lc = I
  254. }
  /* p12 = bit 2 of M.  I < 0 means M < 8: go straight to the remainder. */
  255. { .mib
  256. cmp.gt p8, p0 = 0, I
  257. tbit.nz p12, p0 = M, 2
  258. (p8) br.cond.dpnt .L180
  259. }
  260. ;;
  /*
   * Software-pipelined scaling loop: 16 elements (8 complex values) per
   * iteration, driven by br.ctop with rotating registers and predicates.
   *
   *   p16  loads 16 consecutive elements through CO1 into f32, f38, ..., f122
   *        (every sixth register), issues the prefetch, and advances CO2,
   *        CO3, DO2 and DO3 by 16 * SIZE so they are correct for .L180.
   *   p20  four rotations later the same data is visible as f36, f42, ...;
   *        the complex multiply of the leading pairs is started here.
   *   p21  one rotation later (f37, f43, ..., f127) the remaining pairs are
   *        finished and all 16 results are stored through DO1.
   *
   * For a pair (re, im) the result is
   *        re' = BETA_R * re - BETA_I * im      (FMPY followed by FNMA)
   *        im' = BETA_R * im + BETA_I * re      (FMPY into a temp, then FMA)
   * f6, f7 and f10-f15 hold the BETA_I * re temporaries.  f10 and f11 are
   * written in stage p20 and consumed in stage p21 of the next iteration, so
   * the order of instructions within the loop body must not be changed.
   */
  261. .align 32
  262. .L170:
  263. { .mmf
  264. (p21) STFD [DO1] = f37, 1 * SIZE
  265. (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE
  266. (p21) FNMA f61 = BETA_I, f67, f61
  267. }
  268. { .mmf
  269. (p16) LDFD f32 = [CO1], 1 * SIZE
  270. (p16) adds CO2 = 16 * SIZE, CO2
  271. (p21) FMPY f12 = BETA_I, f85
  272. }
  273. ;;
  /* f10 was produced by stage p20 of the previous iteration. */
  274. { .mfi
  275. (p21) STFD [DO1] = f43, 1 * SIZE
  276. (p21) FMA f67 = BETA_R, f67, f10
  277. (p16) adds CO3 = 16 * SIZE, CO3
  278. }
  279. { .mfi
  280. (p16) LDFD f38 = [CO1], 1 * SIZE
  281. (p21) FMPY f85 = BETA_R, f85
  282. (p16) adds DO2 = 16 * SIZE, DO2
  283. }
  284. ;;
  285. { .mfi
  286. (p21) STFD [DO1] = f49, 1 * SIZE
  287. (p21) FNMA f73 = BETA_I, f79, f73
  288. (p16) adds DO3 = 16 * SIZE, DO3
  289. }
  290. { .mfi
  291. (p16) LDFD f44 = [CO1], 1 * SIZE
  292. (p21) FMPY f13 = BETA_I, f97
  293. nop.i 0
  294. }
  295. ;;
  /* f11 was produced by stage p20 of the previous iteration. */
  296. (p21) STFD [DO1] = f55, 1 * SIZE
  297. (p21) FMA f79 = BETA_R, f79, f11
  298. (p16) LDFD f50 = [CO1], 1 * SIZE
  299. (p21) FMPY f97 = BETA_R, f97
  300. ;;
  301. (p21) STFD [DO1] = f61, 1 * SIZE
  302. (p21) FNMA f85 = BETA_I, f91, f85
  303. (p16) LDFD f56 = [CO1], 1 * SIZE
  304. (p21) FMPY f14 = BETA_I, f109
  305. ;;
  306. (p21) STFD [DO1] = f67, 1 * SIZE
  307. (p21) FMA f91 = BETA_R, f91, f12
  308. (p16) LDFD f62 = [CO1], 1 * SIZE
  309. (p21) FMPY f109 = BETA_R, f109
  310. ;;
  311. (p21) STFD [DO1] = f73, 1 * SIZE
  312. (p21) FNMA f97 = BETA_I, f103, f97
  313. (p16) LDFD f68 = [CO1], 1 * SIZE
  314. (p21) FMPY f15 = BETA_I, f121
  315. ;;
  316. (p21) STFD [DO1] = f79, 1 * SIZE
  317. (p21) FMA f103 = BETA_R, f103, f13
  318. (p16) LDFD f74 = [CO1], 1 * SIZE
  319. (p21) FMPY f121 = BETA_R, f121
  320. ;;
  /* From here on the FMPY slots begin stage p20 work for the next block. */
  321. (p21) STFD [DO1] = f85, 1 * SIZE
  322. (p21) FNMA f109 = BETA_I, f115, f109
  323. (p16) LDFD f80 = [CO1], 1 * SIZE
  324. (p20) FMPY f6 = BETA_I, f36
  325. ;;
  326. (p21) STFD [DO1] = f91, 1 * SIZE
  327. (p21) FMA f115 = BETA_R, f115, f14
  328. (p16) LDFD f86 = [CO1], 1 * SIZE
  329. (p20) FMPY f36 = BETA_R, f36
  330. ;;
  331. (p21) STFD [DO1] = f97, 1 * SIZE
  332. (p21) FNMA f121 = BETA_I, f127, f121
  333. (p16) LDFD f92 = [CO1], 1 * SIZE
  334. (p20) FMPY f7 = BETA_I, f48
  335. ;;
  336. (p21) STFD [DO1] = f103, 1 * SIZE
  337. (p21) FMA f127 = BETA_R, f127, f15
  338. (p16) LDFD f98 = [CO1], 1 * SIZE
  339. (p20) FMPY f48 = BETA_R, f48
  340. ;;
  341. (p21) STFD [DO1] = f109, 1 * SIZE
  342. (p20) FNMA f36 = BETA_I, f42, f36
  343. (p16) LDFD f104 = [CO1], 1 * SIZE
  344. (p20) FMPY f10 = BETA_I, f60
  345. ;;
  346. (p21) STFD [DO1] = f115, 1 * SIZE
  347. (p20) FMA f42 = BETA_R, f42, f6
  348. (p16) LDFD f110 = [CO1], 1 * SIZE
  349. (p20) FMPY f60 = BETA_R, f60
  350. ;;
  351. (p21) STFD [DO1] = f121, 1 * SIZE
  352. (p20) FNMA f48 = BETA_I, f54, f48
  353. (p16) LDFD f116 = [CO1], 1 * SIZE
  354. (p20) FMPY f11 = BETA_I, f72
  355. ;;
  /* Last store of the block; br.ctop rotates the registers and loops. */
  356. (p21) STFD [DO1] = f127, 1 * SIZE
  357. (p20) FMA f54 = BETA_R, f54, f7
  358. (p16) LDFD f122 = [CO1], 1 * SIZE
  359. (p20) FMPY f72 = BETA_R, f72
  360. br.ctop.sptk.few .L170
  361. ;;
  /*
   * General-path remainder (fewer than 8 complex values left in the column).
   *   p12 (bit 2 of M): 4 complex values, f32-f35 via CO1 and f36-f39 via CO2
   *   p13 (bit 1 of M): 2 complex values, f40-f43 via CO1
   *   p14 (bit 0 of M): 1 complex value,  f44/f45 via CO3
   * Everything is loaded first, then scaled, then stored through DO1/DO2/DO3,
   * which walk the same addresses as CO1/CO2/CO3.
   */
  362. .align 32
  363. .L180:
  364. { .mmi
  365. (p12) LDFD f32 = [CO1], 1 * SIZE
  366. (p12) LDFD f36 = [CO2], 1 * SIZE
  367. tbit.nz p13, p0 = M, 1
  368. }
  /* Count this column as done; skip the remainder when (M & 15) == 0. */
  369. { .mmb
  370. cmp.eq p9, p0 = 0, I_AND_15
  371. adds J = -1, J
  372. (p9) br.cond.dptk .L199
  373. }
  374. ;;
  375. { .mmi
  376. (p12) LDFD f33 = [CO1], 1 * SIZE
  377. (p12) LDFD f37 = [CO2], 1 * SIZE
  378. tbit.nz p14, p0 = M, 0
  379. }
  380. ;;
  /* CO3 is moved past the elements that the p12 and p13 cases cover. */
  381. { .mmi
  382. (p12) LDFD f34 = [CO1], 1 * SIZE
  383. (p12) LDFD f38 = [CO2], 1 * SIZE
  384. (p12) adds CO3 = 8 * SIZE, CO3
  385. }
  386. ;;
  /* CO1 jumps over the four elements read through CO2. */
  387. { .mmi
  388. (p12) LDFD f35 = [CO1], 5 * SIZE
  389. (p12) LDFD f39 = [CO2]
  390. (p13) adds CO3 = 4 * SIZE, CO3
  391. }
  392. ;;
  393. { .mmi
  394. (p13) LDFD f40 = [CO1], 1 * SIZE
  395. (p14) LDFD f44 = [CO3], 1 * SIZE
  396. }
  397. ;;
  398. { .mmi
  399. (p13) LDFD f41 = [CO1], 1 * SIZE
  400. (p14) LDFD f45 = [CO3], 1 * SIZE
  401. }
  402. ;;
  403. { .mmf
  404. (p13) LDFD f42 = [CO1], 1 * SIZE
  405. }
  406. ;;
  407. { .mmf
  408. (p13) LDFD f43 = [CO1]
  409. }
  410. ;;
  /* p12 block, step 1: f80-f83 = BETA_I * re, and re is replaced by BETA_R * re. */
  411. (p12) FMPY f80 = BETA_I, f32
  412. (p12) FMPY f32 = BETA_R, f32
  413. (p12) FMPY f81 = BETA_I, f34
  414. (p12) FMPY f34 = BETA_R, f34
  415. (p12) FMPY f82 = BETA_I, f36
  416. (p12) FMPY f36 = BETA_R, f36
  417. (p12) FMPY f83 = BETA_I, f38
  418. (p12) FMPY f38 = BETA_R, f38
  419. ;;
  /* p12 block, step 2: re' = BETA_R*re - BETA_I*im, im' = BETA_R*im + BETA_I*re. */
  420. (p12) FNMA f32 = BETA_I, f33, f32
  421. (p12) FMA f33 = BETA_R, f33, f80
  422. (p12) FNMA f34 = BETA_I, f35, f34
  423. (p12) FMA f35 = BETA_R, f35, f81
  424. (p12) FNMA f36 = BETA_I, f37, f36
  425. (p12) FMA f37 = BETA_R, f37, f82
  426. (p12) FNMA f38 = BETA_I, f39, f38
  427. (p12) FMA f39 = BETA_R, f39, f83
  428. ;;
  /* Same two steps for the p13 and p14 blocks, with temporaries f84-f86. */
  429. (p13) FMPY f84 = BETA_I, f40
  430. (p13) FMPY f40 = BETA_R, f40
  431. (p13) FMPY f85 = BETA_I, f42
  432. (p13) FMPY f42 = BETA_R, f42
  433. (p14) FMPY f86 = BETA_I, f44
  434. (p14) FMPY f44 = BETA_R, f44
  435. ;;
  436. (p13) FNMA f40 = BETA_I, f41, f40
  437. (p13) FMA f41 = BETA_R, f41, f84
  438. (p13) FNMA f42 = BETA_I, f43, f42
  439. (p13) FMA f43 = BETA_R, f43, f85
  440. (p14) FNMA f44 = BETA_I, f45, f44
  441. (p14) FMA f45 = BETA_R, f45, f86
  442. ;;
  /* Write-back; DO3 is positioned the same way CO3 was for the loads. */
  443. { .mmf
  444. (p12) STFD [DO1] = f32, 1 * SIZE
  445. (p12) STFD [DO2] = f36, 1 * SIZE
  446. }
  447. { .mmf
  448. (p12) adds DO3 = 8 * SIZE, DO3
  449. }
  450. ;;
  451. { .mmf
  452. (p12) STFD [DO1] = f33, 1 * SIZE
  453. (p12) STFD [DO2] = f37, 1 * SIZE
  454. }
  455. { .mmf
  456. (p13) adds DO3 = 4 * SIZE, DO3
  457. }
  458. ;;
  459. { .mmf
  460. (p12) STFD [DO1] = f34, 1 * SIZE
  461. (p12) STFD [DO2] = f38, 1 * SIZE
  462. }
  463. ;;
  /* DO1 jumps over the four elements written through DO2. */
  464. { .mmf
  465. (p12) STFD [DO1] = f35, 5 * SIZE
  466. (p12) STFD [DO2] = f39
  467. }
  468. ;;
  469. { .mmi
  470. (p13) STFD [DO1] = f40, 1 * SIZE
  471. (p14) STFD [DO3] = f44, 1 * SIZE
  472. }
  473. ;;
  474. { .mmi
  475. (p13) STFD [DO1] = f41, 1 * SIZE
  476. (p14) STFD [DO3] = f45, 1 * SIZE
  477. }
  478. ;;
  /* The stop inside this bundle separates the two dependent DO1 stores. */
  479. { .mmi
  480. (p13) STFD [DO1] = f42, 1 * SIZE
  481. ;;
  482. (p13) STFD [DO1] = f43
  483. }
  484. ;;
  /*
   * End of one outer iteration on the general path: restore ar.lc, then loop
   * back to .L100 while J > 0.
   */
  485. .align 32
  486. .L199:
  487. { .mib
  488. cmp.lt p6, p0 = 0, J
  489. mov ar.lc = ARLC
  490. (p6) br.cond.dptk .L100
  491. }
  492. ;;
  /* All done: restore every predicate register from PR (mask -1) and return. */
  493. { .mib
  494. mov pr = PR, -1
  495. br.ret.sptk.many b0
  496. }
  497. ;;
  498. EPILOGUE