
level3_thread.c 24 kB

Only initialize the part of the jobs array that will get used

The jobs array is getting initialized in O(compiled cpus^2) complexity. Distros and people with bigger systems will use pretty high values (128 or 256 or more) for this, leading to interesting bubbles in performance.

Baseline (single-threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 times 65x65). The hardware is theoretically capable of 32 multiplications per cycle. The percentage columns below show the performance change relative to this single-threaded baseline.

Matrix      SGEMM cycles    MPC              DGEMM cycles    MPC
48 x 48         10703.9    10.6      0.0%        17990.6     6.3      0.0%
64 x 64         20778.4    12.8      0.0%        40629.2     6.5      0.0%
65 x 65         26869.9    10.3      0.0%        52545.7     5.3      0.0%
80 x 80         38104.5    13.5      0.0%        72492.7     7.1      0.0%
96 x 96         61626.4    14.4      0.0%       113983.8     7.8      0.0%
112 x 112       91803.8    15.3      0.0%       180987.3     7.8      0.0%
128 x 128      133161.4    15.8      0.0%       258374.3     8.1      0.0%

When threading is turned on (TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128):

Matrix      SGEMM cycles    MPC              DGEMM cycles    MPC
48 x 48         10725.9    10.5     -0.2%        18134.9     6.2     -0.8%
64 x 64         20500.6    12.9      1.3%        40929.1     6.5     -0.7%
65 x 65       2040832.1     0.1  -7495.2%      2097633.6     0.1  -3892.0%
80 x 80       2063129.1     0.2  -5314.4%      2119925.2     0.2  -2824.3%
96 x 96       2070374.5     0.4  -3259.6%      2173604.4     0.4  -1806.9%
112 x 112     2111721.5     0.7  -2169.6%      2263330.8     0.6  -1170.0%
128 x 128     2276181.5     0.9  -1609.3%      2377228.9     0.9   -820.1%

There is a deep, deep cliff once you hit 65x65.

With this patch:

Matrix      SGEMM cycles    MPC              DGEMM cycles    MPC
48 x 48         10630.0    10.6      0.7%        18112.8     6.2     -0.7%
64 x 64         20374.8    13.0      1.9%        40487.0     6.5      0.4%
65 x 65        141955.2     1.9   -428.3%       146708.8     1.9   -179.2%
80 x 80        178921.1     2.9   -369.6%       186032.7     2.8   -156.6%
96 x 96        205436.2     4.3   -233.4%       224513.1     3.9    -97.0%
112 x 112      244408.2     5.8   -162.7%       262158.7     5.4    -47.1%
128 x 128      321334.5     6.5   -141.3%       333829.0     6.3    -29.2%

The cliff is very significantly reduced. (more to follow)
7 years ago
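The gist of the patch, as a minimal sketch: the "before" loop bounds are inferred from the commit message rather than quoted from the old code, so treat that half as illustrative; the "after" loop is the one visible in gemm_driver further down.

/* Before (inferred pre-patch form): the flag-clearing loop ran over every
 * CPU the library was compiled for, i.e. O(MAX_CPU_NUMBER^2 * DIVIDE_RATE)
 * stores per blocked step. With NUM_THREADS=128 that is 128 * 128 * 2 =
 * 32768 writes, even when a 65x65 multiply uses only a handful of threads. */
for (i = 0; i < MAX_CPU_NUMBER; i++)
  for (j = 0; j < MAX_CPU_NUMBER; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;

/* After (the "Clear synchronization flags" loop in gemm_driver below):
 * only the nthreads x nthreads block of flags that this call will
 * actually touch gets cleared. */
for (i = 0; i < nthreads; i++)
  for (j = 0; j < nthreads; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;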
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif

// The array of job_t may overflow the stack.
// Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
#elif defined(NT)
#define GEMM_LOCAL GEMM_NT
#elif defined(NR)
#define GEMM_LOCAL GEMM_NR
#elif defined(NC)
#define GEMM_LOCAL GEMM_NC
#elif defined(TN)
#define GEMM_LOCAL GEMM_TN
#elif defined(TT)
#define GEMM_LOCAL GEMM_TT
#elif defined(TR)
#define GEMM_LOCAL GEMM_TR
#elif defined(TC)
#define GEMM_LOCAL GEMM_TC
#elif defined(RN)
#define GEMM_LOCAL GEMM_RN
#elif defined(RT)
#define GEMM_LOCAL GEMM_RT
#elif defined(RR)
#define GEMM_LOCAL GEMM_RR
#elif defined(RC)
#define GEMM_LOCAL GEMM_RC
#elif defined(CN)
#define GEMM_LOCAL GEMM_CN
#elif defined(CT)
#define GEMM_LOCAL GEMM_CT
#elif defined(CR)
#define GEMM_LOCAL GEMM_CR
#elif defined(CC)
#define GEMM_LOCAL GEMM_CC
#endif
#endif
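/* Synchronization flags, one row per consumer thread: a nonzero entry is the
 * address of a B buffer published by the owning thread. Entries are strided
 * CACHE_LINE_SIZE apart so flags polled by different threads stay on
 * separate cache lines. */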
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
#ifndef BETA_OPERATION
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], BETA[1], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#endif

#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef KERNEL_FUNC
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KERNEL_FUNC GEMM_KERNEL_N
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define KERNEL_FUNC GEMM_KERNEL_L
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define KERNEL_FUNC GEMM_KERNEL_R
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define KERNEL_FUNC GEMM_KERNEL_B
#endif
#endif

#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#endif
#endif

#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#endif
#endif
#ifndef A
#define A args -> a
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef B
#define B args -> b
#endif
#ifndef LDB
#define LDB args -> ldb
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDC
#define LDC args -> ldc
#endif
#ifndef M
#define M args -> m
#endif
#ifndef N
#define N args -> n
#endif
#ifndef K
#define K args -> k
#endif

#ifdef TIMING
#define START_RPCC()       rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif
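/* Worker body: each thread packs its panel of A, packs and publishes its
 * share of B through the job array, then consumes the B panels published by
 * the other threads in its row of the 2D thread grid. */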
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  IFLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  IFLOAT *a, *b;
  FLOAT *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
  BLASLONG i, current;
  BLASLONG l1stride;

#ifdef TIMING
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (IFLOAT *)A;
  b = (IFLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
    nthreads_m = range_m[-1];
  }
  mypos_n = blas_quickdivide(mypos, nthreads_m); /* mypos_n = mypos / nthreads_m */
  mypos_m = mypos - mypos_n * nthreads_m;        /* mypos_m = mypos % nthreads_m */

  /* Initialize m and n */
  m_from = 0;
  m_to   = M;
  if (range_m) {
    m_from = range_m[mypos_m + 0];
    m_to   = range_m[mypos_m + 1];
  }
  n_from = 0;
  n_to   = N;
  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];
  }

  /* Multiply C by beta if needed */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc);
  }

  /* Return early if no more computation is needed */
  if ((k == 0) || (alpha == NULL)) return 0;
  if (alpha[0] == ZERO
#ifdef COMPLEX
      && alpha[1] == ZERO
#endif
      ) return 0;

  /* Initialize workspace for local region of B */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }

  /* Iterate through steps of k */
  for(ls = 0; ls < k; ls += min_l){

    /* Determine step size in k */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Determine step size in m
     * Note: We are currently on the first step in m
     */
    l1stride = 1;
    min_i = m_to - m_from;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
      } else {
        if (args -> nthreads == 1) l1stride = 0;
      }
    }

    /* Copy local region of A into workspace */
    START_RPCC();
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

      /* Make sure no one is still using the workspace */
      START_RPCC();
      for (i = 0; i < args -> nthreads; i++)
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      STOP_RPCC(waiting1);
      MB;

#if defined(FUSED_GEMM) && !defined(TIMING)
      /* Fused operation to copy region of B into workspace and apply kernel */
      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha,
                             sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls);
#else
      /* Split local region of B into parts */
      for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
/*
        if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
        else
*/
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
        /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
                        buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride);
        STOP_RPCC(copy_B);

        /* Apply kernel with local region of A and part of local region of B */
        START_RPCC();
        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride,
                         c, ldc, m_from, jjs);
        STOP_RPCC(kernel);
#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif
      }
#endif

      WMB;
      /* Set flag so other threads can access local region of B */
      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
    }

    /* Get regions of B from other threads and apply kernel */
    current = mypos;
    do {

      /* This thread accesses regions of B from threads in the range
       * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
      current ++;
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

          /* Wait until other region of B is initialized */
          START_RPCC();
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
          STOP_RPCC(waiting2);
          MB;

          /* Apply kernel with local region of A and part of other region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, js);
          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
        }

        /* Clear synchronization flag if this thread is done with other region of B */
        if (m_to - m_from == min_i) {
          WMB;
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
        }
      }
    } while (current != mypos);

    /* Iterate through steps of m
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }

      /* Copy local region of A into workspace */
      START_RPCC();
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      STOP_RPCC(copy_A);

      /* Get regions of B and apply kernel */
      current = mypos;
      do {

        /* Split region of B into parts and apply kernel */
        div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
        for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, js);
          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
            WMB;
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          }
        }

        /* This thread accesses regions of B from threads in the range
         * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
        current ++;
        if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;
      } while (current != mypos);
    }
  }

  /* Wait until all other threads are done with local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < DIVIDE_RATE; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js]) {YIELDING;};
    }
  }
  STOP_RPCC(waiting3);
  MB;

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
          mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
          (double)waiting1 /(double)total * 100.,
          (double)waiting2 /(double)total * 100.,
          (double)waiting3 /(double)total * 100.,
          (double)ops/(double)kernel / 4. * 100.);
  fprintf(stderr, "\n");
#endif

  return 0;
}
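/* Round a partition width up to a multiple of `multiple` (GEMM_PREFERED_SIZE),
 * unless less than one multiple of work remains or the width already fits
 * within one multiple. */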
static int round_up(int remainder, int width, int multiple)
{
  if (multiple > remainder || width <= multiple)
    return width;
  width = (width + multiple - 1) / multiple;
  width = width * multiple;
  return width;
}
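/* Set up the shared job array and thread queue, partition m and n across the
 * nthreads_m x nthreads_n grid, and dispatch the inner_thread workers. */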
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                       IFLOAT *sa, IFLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
  CRITICAL_SECTION level3_lock;
  InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t job[MAX_CPU_NUMBER];
#else
  job_t *job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG *range_M, *range_N;
  BLASLONG num_parts;
  BLASLONG nthreads = args -> nthreads;
  BLASLONG width, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int mode;

  /* Get execution mode */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif
#else
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE;
#endif
#endif

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  pthread_mutex_lock(&level3_lock);
#else
  EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

#ifdef USE_ALLOC_HEAP
  /* Dynamically allocate workspace */
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Initialize struct for arguments */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common   = (void *)job;
#ifdef PARAMTEST
  newarg.gemm_p = args -> gemm_p;
  newarg.gemm_q = args -> gemm_q;
  newarg.gemm_r = args -> gemm_r;
#endif

  /* Initialize partitions in m and n
   * Note: The number of CPU partitions is stored in the -1 entry */
  range_M = &range_M_buffer[1];
  range_N = &range_N_buffer[1];
  range_M[-1] = nthreads_m;
  range_N[-1] = nthreads_n;
  if (!range_m) {
    range_M[0] = 0;
    m = args -> m;
  } else {
    range_M[0] = range_m[0];
    m = range_m[1] - range_m[0];
  }

  /* Partition m into nthreads_m regions */
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
    width = round_up(m, width, GEMM_PREFERED_SIZE);
    m -= width;
    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;
    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
    range_M[i + 1] = range_M[num_parts];
  }

  /* Initialize parameters for parallel execution */
  for (i = 0; i < nthreads; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = range_M;
    queue[i].range_n = range_N;
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  queue[0].sa = sa;
  queue[0].sb = sb;
  queue[nthreads - 1].next = NULL;

  /* Iterate through steps of n */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }
  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;

    /* Partition (a step of) n into nthreads regions */
    range_N[0] = js;
    num_parts  = 0;
    while (n > 0){
      width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
      if (width < SWITCH_RATIO) {
        width = SWITCH_RATIO;
      }
      width = round_up(n, width, GEMM_PREFERED_SIZE);
      n -= width;
      if (n < 0) width = width + n;
      range_N[num_parts + 1] = range_N[num_parts] + width;
      num_parts ++;
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
      range_N[j + 1] = range_N[num_parts];
    }

    /* Clear synchronization flags */
    for (i = 0; i < nthreads; i++) {
      for (j = 0; j < nthreads; j++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }
    WMB;

    /* Execute parallel computation */
    exec_blas(nthreads, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  pthread_mutex_unlock(&level3_lock);
#else
  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

  return 0;
}
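/* Entry point: choose the 2D thread decomposition (nthreads_m x nthreads_n),
 * falling back to the single-threaded GEMM when the matrix is too small for
 * threading to pay off. */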
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG nthreads_m, nthreads_n;

  /* Get dimensions from index ranges if available */
  if (range_m) {
    m = range_m[1] - range_m[0];
  }
  if (range_n) {
    n = range_n[1] - range_n[0];
  }

  /* Partitions in m should have at least SWITCH_RATIO rows */
  if (m < 2 * SWITCH_RATIO) {
    nthreads_m = 1;
  } else {
    nthreads_m = args -> nthreads;
    while (m < nthreads_m * SWITCH_RATIO) {
      nthreads_m = nthreads_m / 2;
    }
  }

  /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
  if (n < SWITCH_RATIO * nthreads_m) {
    nthreads_n = 1;
  } else {
    nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }
  }

  /* Execute serial or parallel computation */
  if (nthreads_m * nthreads_n <= 1) {
    GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
  } else {
    args -> nthreads = nthreads_m * nthreads_n;
    gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n);
  }

  return 0;
}