
level3_thread.c 24 kB

Only initialize the part of the jobs array that will get used

The jobs array was being initialized in O(compiled cpus^2) time. Distros and people with bigger systems will build with pretty high values for NUM_THREADS (128, 256 or more), leading to interesting bubbles in performance.

Baseline (single-threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 mult by 65x65). The hardware is theoretically capable of 32 multiplications per cycle.

Matrix        SGEMM cycles    MPC  vs base    DGEMM cycles    MPC  vs base
48 x 48            10703.9   10.6     0.0%         17990.6    6.3     0.0%
64 x 64            20778.4   12.8     0.0%         40629.2    6.5     0.0%
65 x 65            26869.9   10.3     0.0%         52545.7    5.3     0.0%
80 x 80            38104.5   13.5     0.0%         72492.7    7.1     0.0%
96 x 96            61626.4   14.4     0.0%        113983.8    7.8     0.0%
112 x 112          91803.8   15.3     0.0%        180987.3    7.8     0.0%
128 x 128         133161.4   15.8     0.0%        258374.3    8.1     0.0%

With threading turned on
(TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128):

Matrix        SGEMM cycles    MPC  vs base    DGEMM cycles    MPC  vs base
48 x 48            10725.9   10.5    -0.2%         18134.9    6.2    -0.8%
64 x 64            20500.6   12.9     1.3%         40929.1    6.5    -0.7%
65 x 65          2040832.1    0.1 -7495.2%       2097633.6    0.1 -3892.0%
80 x 80          2063129.1    0.2 -5314.4%       2119925.2    0.2 -2824.3%
96 x 96          2070374.5    0.4 -3259.6%       2173604.4    0.4 -1806.9%
112 x 112        2111721.5    0.7 -2169.6%       2263330.8    0.6 -1170.0%
128 x 128        2276181.5    0.9 -1609.3%       2377228.9    0.9  -820.1%

There is a deep, deep cliff once you hit 65x65.

With this patch:

Matrix        SGEMM cycles    MPC  vs base    DGEMM cycles    MPC  vs base
48 x 48            10630.0   10.6     0.7%         18112.8    6.2    -0.7%
64 x 64            20374.8   13.0     1.9%         40487.0    6.5     0.4%
65 x 65           141955.2    1.9  -428.3%        146708.8    1.9  -179.2%
80 x 80           178921.1    2.9  -369.6%        186032.7    2.8  -156.6%
96 x 96           205436.2    4.3  -233.4%        224513.1    3.9   -97.0%
112 x 112         244408.2    5.8  -162.7%        262158.7    5.4   -47.1%
128 x 128         321334.5    6.5  -141.3%        333829.0    6.3   -29.2%

The cliff is very significantly reduced.

(more to follow)
7 years ago
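A minimal before/after sketch of the change, reconstructed from the flag-clearing loop in gemm_driver below. The post-patch loop is taken from the file; the MAX_CPU_NUMBER bound in the "before" half is an assumption based on the O(compiled cpus^2) description above.

/* Pre-patch (assumed from the commit message): every call cleared the
 * flags for every compiled-in CPU slot, i.e. O(MAX_CPU_NUMBER^2) writes
 * even for a run that uses only a handful of threads. */
for (i = 0; i < MAX_CPU_NUMBER; i++)
  for (j = 0; j < MAX_CPU_NUMBER; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;

/* Post-patch (as in gemm_driver below): clear only the nthreads x nthreads
 * block of flags that this call will actually use. */
for (i = 0; i < nthreads; i++)
  for (j = 0; j < nthreads; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;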
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
#elif defined(NT)
#define GEMM_LOCAL GEMM_NT
#elif defined(NR)
#define GEMM_LOCAL GEMM_NR
#elif defined(NC)
#define GEMM_LOCAL GEMM_NC
#elif defined(TN)
#define GEMM_LOCAL GEMM_TN
#elif defined(TT)
#define GEMM_LOCAL GEMM_TT
#elif defined(TR)
#define GEMM_LOCAL GEMM_TR
#elif defined(TC)
#define GEMM_LOCAL GEMM_TC
#elif defined(RN)
#define GEMM_LOCAL GEMM_RN
#elif defined(RT)
#define GEMM_LOCAL GEMM_RT
#elif defined(RR)
#define GEMM_LOCAL GEMM_RR
#elif defined(RC)
#define GEMM_LOCAL GEMM_RC
#elif defined(CN)
#define GEMM_LOCAL GEMM_CN
#elif defined(CT)
#define GEMM_LOCAL GEMM_CT
#elif defined(CR)
#define GEMM_LOCAL GEMM_CR
#elif defined(CC)
#define GEMM_LOCAL GEMM_CC
#endif
#endif
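/* Each thread publishes the workspace buffers holding its copied panels of B
 * through job[thread].working[...]. The flags are spaced CACHE_LINE_SIZE
 * BLASLONGs apart so that entries polled by different threads do not share a
 * cache line (intent inferred from how the array is indexed below). */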
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

#ifndef BETA_OPERATION
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], BETA[1], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#endif

#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef KERNEL_FUNC
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KERNEL_FUNC GEMM_KERNEL_N
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define KERNEL_FUNC GEMM_KERNEL_L
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define KERNEL_FUNC GEMM_KERNEL_R
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define KERNEL_FUNC GEMM_KERNEL_B
#endif
#endif

#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#endif
#endif

#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \
                      (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                      (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \
                      (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                      (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#endif
#endif

#ifndef A
#define A args -> a
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef B
#define B args -> b
#endif
#ifndef LDB
#define LDB args -> ldb
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDC
#define LDC args -> ldc
#endif
#ifndef M
#define M args -> m
#endif
#ifndef N
#define N args -> n
#endif
#ifndef K
#define K args -> k
#endif

#ifdef TIMING
#define START_RPCC()       rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif
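/* Worker body run on each thread: copies this thread's panels of A and B
 * into the workspace, publishes its B panels through the job array, and
 * consumes the B panels published by the other threads in its thread group
 * (summary comment added for readability). */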
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
  BLASLONG i, current;
  BLASLONG l1stride;

#ifdef TIMING
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
    nthreads_m = range_m[-1];
  }
  mypos_n = blas_quickdivide(mypos, nthreads_m);  /* mypos_n = mypos / nthreads_m */
  mypos_m = mypos - mypos_n * nthreads_m;         /* mypos_m = mypos % nthreads_m */

  /* Initialize m and n */
  m_from = 0;
  m_to   = M;
  if (range_m) {
    m_from = range_m[mypos_m + 0];
    m_to   = range_m[mypos_m + 1];
  }
  n_from = 0;
  n_to   = N;
  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];
  }

  /* Multiply C by beta if needed */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc);
  }

  /* Return early if no more computation is needed */
  if ((k == 0) || (alpha == NULL)) return 0;
  if (alpha[0] == ZERO
#ifdef COMPLEX
      && alpha[1] == ZERO
#endif
     ) return 0;

  /* Initialize workspace for local region of B */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }

  /* Iterate through steps of k */
  for(ls = 0; ls < k; ls += min_l){

    /* Determine step size in k */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Determine step size in m
     * Note: We are currently on the first step in m */
    l1stride = 1;
    min_i = m_to - m_from;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
      } else {
        if (args -> nthreads == 1) l1stride = 0;
      }
    }

    /* Copy local region of A into workspace */
    START_RPCC();
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

      /* Make sure no other thread is still using this part of the workspace */
      START_RPCC();
      for (i = 0; i < args -> nthreads; i++)
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
      STOP_RPCC(waiting1);

#if defined(FUSED_GEMM) && !defined(TIMING)

      /* Fused operation to copy region of B into workspace and apply kernel */
      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha,
                             sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls);

#else

      /* Split local region of B into parts */
      for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
          if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
          else
            if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

        /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
                        buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride);
        STOP_RPCC(copy_B);

        /* Apply kernel with local region of A and part of local region of B */
        START_RPCC();
        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride,
                         c, ldc, m_from, jjs);
        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif
      }
#endif

      /* Set flag so other threads can access local region of B */
      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      WMB;
    }

    /* Get regions of B from other threads and apply kernel */
    current = mypos;
    do {

      /* This thread accesses regions of B from threads in the range
       * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
      current ++;
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

          /* Wait until other region of B is initialized */
          START_RPCC();
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
          STOP_RPCC(waiting2);

          /* Apply kernel with local region of A and part of other region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
        }

        /* Clear synchronization flag if this thread is done with other region of B */
        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          WMB;
        }
      }
    } while (current != mypos);

    /* Iterate through steps of m
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }

      /* Copy local region of A into workspace */
      START_RPCC();
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      STOP_RPCC(copy_A);

      /* Get regions of B and apply kernel */
      current = mypos;
      do {

        /* Split region of B into parts and apply kernel */
        div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
        for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
            WMB;
          }
        }

        /* This thread accesses regions of B from threads in the range
         * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
        current ++;
        if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      } while (current != mypos);
    }
  }

  /* Wait until all other threads are done with local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < DIVIDE_RATE; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js]) {YIELDING;MB;};
    }
  }
  STOP_RPCC(waiting3);

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
          mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
          (double)waiting1 /(double)total * 100.,
          (double)waiting2 /(double)total * 100.,
          (double)waiting3 /(double)total * 100.,
          (double)ops/(double)kernel / 4. * 100.);
  fprintf(stderr, "\n");
#endif

  return 0;
}
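/* Round width up to a multiple of the preferred size, unless the remaining
 * work is smaller than one multiple or width already fits within one. */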
static int round_up(int remainder, int width, int multiple)
{
  if (multiple > remainder || width <= multiple)
    return width;
  width = (width + multiple - 1) / multiple;
  width = width * multiple;
  return width;
}
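/* Driver: partitions m and n across nthreads_m x nthreads_n threads, sets up
 * the shared job array and queue, and dispatches inner_thread via exec_blas
 * (summary comment added for readability). */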
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                       FLOAT *sa, FLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
  CRITICAL_SECTION level3_lock;
  InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t job[MAX_CPU_NUMBER];
#else
  job_t *job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG *range_M, *range_N;
  BLASLONG num_parts;
  BLASLONG nthreads = args -> nthreads;
  BLASLONG width, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int mode;

  /* Get execution mode */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_REAL | BLAS_NODE;
#else
  mode = BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
#endif
#else
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_COMPLEX | BLAS_NODE;
#else
  mode = BLAS_SINGLE  | BLAS_COMPLEX | BLAS_NODE;
#endif
#endif

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  pthread_mutex_lock(&level3_lock);
#else
  EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

#ifdef USE_ALLOC_HEAP
  /* Dynamically allocate workspace */
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Initialize struct for arguments */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common   = (void *)job;

#ifdef PARAMTEST
  newarg.gemm_p = args -> gemm_p;
  newarg.gemm_q = args -> gemm_q;
  newarg.gemm_r = args -> gemm_r;
#endif

  /* Initialize partitions in m and n
   * Note: The number of CPU partitions is stored in the -1 entry */
  range_M = &range_M_buffer[1];
  range_N = &range_N_buffer[1];
  range_M[-1] = nthreads_m;
  range_N[-1] = nthreads_n;

  if (!range_m) {
    range_M[0] = 0;
    m          = args -> m;
  } else {
    range_M[0] = range_m[0];
    m          = range_m[1] - range_m[0];
  }

  /* Partition m into nthreads_m regions */
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);

    width = round_up(m, width, GEMM_PREFERED_SIZE);

    m -= width;
    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;

    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
    range_M[i + 1] = range_M[num_parts];
  }

  /* Initialize parameters for parallel execution */
  for (i = 0; i < nthreads; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = range_M;
    queue[i].range_n = range_N;
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  queue[0].sa = sa;
  queue[0].sb = sb;
  queue[nthreads - 1].next = NULL;

  /* Iterate through steps of n */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }
  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;

    /* Partition (a step of) n into nthreads regions */
    range_N[0] = js;
    num_parts  = 0;
    while (n > 0){
      width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
      if (width < SWITCH_RATIO) {
        width = SWITCH_RATIO;
      }
      width = round_up(n, width, GEMM_PREFERED_SIZE);

      n -= width;
      if (n < 0) width = width + n;
      range_N[num_parts + 1] = range_N[num_parts] + width;

      num_parts ++;
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
      range_N[j + 1] = range_N[num_parts];
    }
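    /* Only the nthreads x nthreads flags this call will use are cleared
     * below; touching all MAX_CPU_NUMBER entries on every call is the
     * O(compiled cpus^2) overhead described in the commit message. */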
    /* Clear synchronization flags */
    for (i = 0; i < nthreads; i++) {
      for (j = 0; j < nthreads; j++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }

    /* Execute parallel computation */
    exec_blas(nthreads, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
  pthread_mutex_unlock(&level3_lock);
#else
  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif

  return 0;
}
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG nthreads_m, nthreads_n;

  /* Get dimensions from index ranges if available */
  if (range_m) {
    m = range_m[1] - range_m[0];
  }
  if (range_n) {
    n = range_n[1] - range_n[0];
  }

  /* Partitions in m should have at least SWITCH_RATIO rows */
  if (m < 2 * SWITCH_RATIO) {
    nthreads_m = 1;
  } else {
    nthreads_m = args -> nthreads;
    while (m < nthreads_m * SWITCH_RATIO) {
      nthreads_m = nthreads_m / 2;
    }
  }

  /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
  if (n < SWITCH_RATIO * nthreads_m) {
    nthreads_n = 1;
  } else {
    nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }
  }

  /* Execute serial or parallel computation */
  if (nthreads_m * nthreads_n <= 1) {
    GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
  } else {
    args -> nthreads = nthreads_m * nthreads_n;
    gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n);
  }

  return 0;
}