You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm3m_level3.c 15 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /*
   * BETA_OPERATION: default hook that hands a sub-block of C to GEMM_BETA.
   *
   * GEMM_BETA receives the extents (M_TO - M_FROM) and (N_TO - N_FROM), the two
   * values BETA[0] and BETA[1], and a pointer into C.  A build may pre-define
   * BETA_OPERATION to override this default.
   *
   * The element offset (M_FROM + N_FROM * LDC) is scaled as a whole by
   * COMPSIZE, which is the same addressing KERNEL_OPERATION uses for C.
   * Scaling only the column term would make a non-zero M_FROM point at a
   * different region of C than the one the kernel later updates.
   */
  #ifndef BETA_OPERATION
  #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
    GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
              (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
              (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
  #endif
  /*
   * Default packing hooks for the A operand: ICOPYB_OPERATION,
   * ICOPYR_OPERATION and ICOPYI_OPERATION.  Each one may be pre-defined by the
   * including build, in which case the default below is skipped.
   *
   * For the NN/NT/NC/NR/RN/RT/RC/RR builds the GEMM3M_ITCOPY{B,R,I} routines
   * are used and the source pointer is A + (Y + X * LDA) * COMPSIZE; for every
   * other build the GEMM3M_INCOPY{B,R,I} routines are used with
   * A + (X + Y * LDA) * COMPSIZE.
   */
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)

  #ifndef ICOPYB_OPERATION
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #ifndef ICOPYR_OPERATION
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #ifndef ICOPYI_OPERATION
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #else

  #ifndef ICOPYB_OPERATION
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #ifndef ICOPYR_OPERATION
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #ifndef ICOPYI_OPERATION
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif

  #endif
  /*
   * Default packing hooks for the B operand: OCOPYB_OPERATION,
   * OCOPYR_OPERATION and OCOPYI_OPERATION.  Each one may be pre-defined by the
   * including build, in which case the default below is skipped.  ALPHA_R and
   * ALPHA_I are forwarded unchanged to the copy routine.
   *
   * For the NN/TN/CN/RN/NR/TR/CR/RR builds the GEMM3M_ONCOPY{B,R,I} routines
   * are used and the source pointer is A + (X + Y * LDA) * COMPSIZE; for every
   * other build the GEMM3M_OTCOPY{B,R,I} routines are used with
   * A + (Y + X * LDA) * COMPSIZE.
   */
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)

  #ifndef OCOPYB_OPERATION
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #ifndef OCOPYR_OPERATION
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #ifndef OCOPYI_OPERATION
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #else

  #ifndef OCOPYB_OPERATION
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #ifndef OCOPYR_OPERATION
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #ifndef OCOPYI_OPERATION
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif

  #endif
  /*
   * KERNEL_FUNC names the compute kernel (GEMM3M_KERNEL unless overridden).
   *
   * KERNEL_OPERATION calls it on an M x N x K problem with the packed buffers
   * SA and SB and a pointer to element (X, Y) of C, i.e.
   * C + (X + Y * LDC) * COMPSIZE.  LDC is parenthesized inside the offset so
   * that an expression argument (for example "ld + 1") is multiplied as a
   * whole instead of being split by operator precedence.
   */
  #ifndef KERNEL_FUNC
  #define KERNEL_FUNC GEMM3M_KERNEL
  #endif
  #ifndef KERNEL_OPERATION
  #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
    KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, \
                (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, LDC)
  #endif
  /*
   * Short names for the fields of the argument block.  Every name expands to a
   * member access on a variable called "args" that must be in scope at the
   * point of use; each can be pre-defined by the including build to redirect
   * it.
   */

  /* Operand pointers. */
  #if !defined(A)
  #define A args->a
  #endif
  #if !defined(B)
  #define B args->b
  #endif
  #if !defined(C)
  #define C args->c
  #endif

  /* Leading dimensions. */
  #if !defined(LDA)
  #define LDA args->lda
  #endif
  #if !defined(LDB)
  #define LDB args->ldb
  #endif
  #if !defined(LDC)
  #define LDC args->ldc
  #endif

  /* Problem extents. */
  #if !defined(M)
  #define M args->m
  #endif
  #if !defined(N)
  #define N args->n
  #endif
  #if !defined(K)
  #define K args->k
  #endif
  /*
   * Per-build coefficient tables.  Each group of build modes defines the same
   * twelve names (ALPHA1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 17, 18) from ONE,
   * -ONE and ZERO.  In this file the pairs (ALPHA5, ALPHA6),
   * (ALPHA11, ALPHA12) and (ALPHA17, ALPHA18) are passed as the
   * (ALPHA_R, ALPHA_I) arguments of KERNEL_OPERATION; the remaining names are
   * not referenced in the visible part of the file.
   */

  /* Modes NN, NT, TN, TT. */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define ALPHA1   ONE
  #define ALPHA2   ONE
  #define ALPHA5   ZERO
  #define ALPHA6   ONE
  #define ALPHA7   ONE
  #define ALPHA8   ZERO
  #define ALPHA11  ONE
  #define ALPHA12  -ONE
  #define ALPHA13  ZERO
  #define ALPHA14  ONE
  #define ALPHA17  -ONE
  #define ALPHA18  -ONE
  #endif

  /* Modes NR, NC, TR, TC. */
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define ALPHA1   ONE
  #define ALPHA2   ONE
  #define ALPHA5   ONE
  #define ALPHA6   ZERO
  #define ALPHA7   ZERO
  #define ALPHA8   ONE
  #define ALPHA11  -ONE
  #define ALPHA12  -ONE
  #define ALPHA13  ONE
  #define ALPHA14  ZERO
  #define ALPHA17  -ONE
  #define ALPHA18  ONE
  #endif

  /* Modes RN, RT, CN, CT. */
  #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define ALPHA1   ONE
  #define ALPHA2   ONE
  #define ALPHA5   ONE
  #define ALPHA6   ZERO
  #define ALPHA7   ZERO
  #define ALPHA8   ONE
  #define ALPHA11  -ONE
  #define ALPHA12  ONE
  #define ALPHA13  ONE
  #define ALPHA14  ZERO
  #define ALPHA17  -ONE
  #define ALPHA18  -ONE
  #endif

  /* Modes RR, RC, CR, CC. */
  #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define ALPHA1   ONE
  #define ALPHA2   ONE
  #define ALPHA5   ZERO
  #define ALPHA6   -ONE
  #define ALPHA7   ONE
  #define ALPHA8   ZERO
  #define ALPHA11  ONE
  #define ALPHA12  ONE
  #define ALPHA13  ZERO
  #define ALPHA14  ONE
  #define ALPHA17  -ONE
  #define ALPHA18  ONE
  #endif
  /*
   * Cycle-counter hooks.  Without TIMING both macros expand to nothing.  With
   * TIMING, START_RPCC() stores rpcc() in a variable named rpcc_counter (which
   * must exist at the point of use) and STOP_RPCC(COUNTER) adds the difference
   * between a fresh rpcc() reading and rpcc_counter to COUNTER.
   */
  #ifndef TIMING
  #define START_RPCC()
  #define STOP_RPCC(COUNTER)
  #else
  #define START_RPCC() rpcc_counter = rpcc()
  #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
  #endif
  203. int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
  204. FLOAT *sa, FLOAT *sb, BLASLONG dummy){
  205. BLASLONG k, lda, ldb, ldc;
  206. FLOAT *alpha, *beta;
  207. FLOAT *a, *b, *c;
  208. BLASLONG m_from, m_to, n_from, n_to;
  209. BLASLONG ls, is, js, jjs;
  210. BLASLONG min_l, min_i, min_j, min_jj;
  211. #ifdef TIMING
  212. BLASULONG rpcc_counter;
  213. BLASULONG BLASLONG innercost = 0;
  214. BLASULONG BLASLONG outercost = 0;
  215. BLASULONG BLASLONG kernelcost = 0;
  216. double total;
  217. #endif
  218. k = K;
  219. a = (FLOAT *)A;
  220. b = (FLOAT *)B;
  221. c = (FLOAT *)C;
  222. lda = LDA;
  223. ldb = LDB;
  224. ldc = LDC;
  225. alpha = (FLOAT *)args -> alpha;
  226. beta = (FLOAT *)args -> beta;
  227. m_from = 0;
  228. m_to = M;
  229. if (range_m) {
  230. m_from = *(((BLASLONG *)range_m) + 0);
  231. m_to = *(((BLASLONG *)range_m) + 1);
  232. }
  233. n_from = 0;
  234. n_to = N;
  235. if (range_n) {
  236. n_from = *(((BLASLONG *)range_n) + 0);
  237. n_to = *(((BLASLONG *)range_n) + 1);
  238. }
  239. if (beta) {
  240. #ifndef COMPLEX
  241. if (beta[0] != ONE)
  242. #else
  243. if ((beta[0] != ONE) || (beta[1] != ZERO))
  244. #endif
  245. BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
  246. }
  247. if ((k == 0) || (alpha == NULL)) return 0;
  248. if ((alpha[0] == ZERO)
  249. #ifdef COMPLEX
  250. && (alpha[1] == ZERO)
  251. #endif
  252. ) return 0;
  253. #if 0
  254. printf("GEMM: M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k);
  255. printf("GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R);
  256. printf("GEMM: SA .. %p SB .. %p\n", sa, sb);
  257. #endif
  258. #ifdef TIMING
  259. innercost = 0;
  260. outercost = 0;
  261. kernelcost = 0;
  262. #endif
  263. for(js = n_from; js < n_to; js += GEMM3M_R){
  264. min_j = n_to - js;
  265. if (min_j > GEMM3M_R) min_j = GEMM3M_R;
  266. for(ls = 0; ls < k; ls += min_l){
  267. min_l = k - ls;
  268. if (min_l >= GEMM3M_Q * 2) {
  269. min_l = GEMM3M_Q;
  270. } else {
  271. if (min_l > GEMM3M_Q) {
  272. min_l = (min_l + 1) / 2;
  273. #ifdef UNROLL_X
  274. min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X;
  275. #endif
  276. }
  277. }
  278. min_i = m_to - m_from;
  279. if (min_i >= GEMM3M_P * 2) {
  280. min_i = GEMM3M_P;
  281. } else {
  282. if (min_i > GEMM3M_P) {
  283. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  284. }
  285. }
  286. START_RPCC();
  287. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  288. STOP_RPCC(innercost);
  289. for(jjs = js; jjs < js + min_j; jjs += min_jj){
  290. min_jj = min_j + js - jjs;
  291. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  292. START_RPCC();
  293. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
  294. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
  295. #else
  296. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
  297. #endif
  298. STOP_RPCC(outercost);
  299. START_RPCC();
  300. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
  301. sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
  302. STOP_RPCC(kernelcost);
  303. }
  304. for(is = m_from + min_i; is < m_to; is += min_i){
  305. min_i = m_to - is;
  306. if (min_i >= GEMM3M_P * 2) {
  307. min_i = GEMM3M_P;
  308. } else
  309. if (min_i > GEMM3M_P) {
  310. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  311. }
  312. START_RPCC();
  313. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  314. STOP_RPCC(innercost);
  315. START_RPCC();
  316. KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);
  317. STOP_RPCC(kernelcost);
  318. }
  319. min_i = m_to - m_from;
  320. if (min_i >= GEMM3M_P * 2) {
  321. min_i = GEMM3M_P;
  322. } else {
  323. if (min_i > GEMM3M_P) {
  324. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  325. }
  326. }
  327. START_RPCC();
  328. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  329. STOP_RPCC(innercost);
  330. for(jjs = js; jjs < js + min_j; jjs += min_jj){
  331. min_jj = min_j + js - jjs;
  332. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  333. START_RPCC();
  334. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  335. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
  336. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  337. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
  338. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  339. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
  340. #else
  341. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
  342. #endif
  343. STOP_RPCC(outercost);
  344. START_RPCC();
  345. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
  346. sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
  347. STOP_RPCC(kernelcost);
  348. }
  349. for(is = m_from + min_i; is < m_to; is += min_i){
  350. min_i = m_to - is;
  351. if (min_i >= GEMM3M_P * 2) {
  352. min_i = GEMM3M_P;
  353. } else
  354. if (min_i > GEMM3M_P) {
  355. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  356. }
  357. START_RPCC();
  358. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  359. STOP_RPCC(innercost);
  360. START_RPCC();
  361. KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);
  362. STOP_RPCC(kernelcost);
  363. }
  364. min_i = m_to - m_from;
  365. if (min_i >= GEMM3M_P * 2) {
  366. min_i = GEMM3M_P;
  367. } else {
  368. if (min_i > GEMM3M_P) {
  369. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  370. }
  371. }
  372. START_RPCC();
  373. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  374. STOP_RPCC(innercost);
  375. for(jjs = js; jjs < js + min_j; jjs += min_jj){
  376. min_jj = min_j + js - jjs;
  377. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  378. START_RPCC();
  379. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  380. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
  381. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  382. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
  383. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  384. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
  385. #else
  386. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
  387. #endif
  388. STOP_RPCC(outercost);
  389. START_RPCC();
  390. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
  391. sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
  392. STOP_RPCC(kernelcost);
  393. }
  394. for(is = m_from + min_i; is < m_to; is += min_i){
  395. min_i = m_to - is;
  396. if (min_i >= GEMM3M_P * 2) {
  397. min_i = GEMM3M_P;
  398. } else
  399. if (min_i > GEMM3M_P) {
  400. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  401. }
  402. START_RPCC();
  403. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  404. STOP_RPCC(innercost);
  405. START_RPCC();
  406. KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);
  407. STOP_RPCC(kernelcost);
  408. }
  409. } /* end of js */
  410. } /* end of ls */
  411. #ifdef TIMING
  412. total = (double)outercost + (double)innercost + (double)kernelcost;
  413. printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n",
  414. innercost / total * 100., outercost / total * 100.,
  415. kernelcost / total * 100.);
  416. printf( " Total %10.3f%% %10.3f MFlops\n",
  417. ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
  418. 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
  419. #endif
  420. return 0;
  421. }