
level3_thread.c 28 kB

Only initialize the part of the jobs array that will get used

The jobs array is initialized with O(compiled CPUs^2) complexity. Distros and people with bigger systems build with fairly high values for the compile-time CPU count (128, 256 or more), which leads to noticeable bubbles in performance. Baseline (single-threaded) performance reaches roughly 13 to 15 multiplications per cycle (MPC) in the interesting range (threading kicks in at a 65x65 by 65x65 multiply); the hardware is theoretically capable of 32 multiplications per cycle.

Baseline, single-threaded:

    Matrix        SGEMM cycles    MPC   vs. base    DGEMM cycles    MPC   vs. base
    48 x 48            10703.9   10.6       0.0%         17990.6    6.3       0.0%
    64 x 64            20778.4   12.8       0.0%         40629.2    6.5       0.0%
    65 x 65            26869.9   10.3       0.0%         52545.7    5.3       0.0%
    80 x 80            38104.5   13.5       0.0%         72492.7    7.1       0.0%
    96 x 96            61626.4   14.4       0.0%        113983.8    7.8       0.0%
    112 x 112          91803.8   15.3       0.0%        180987.3    7.8       0.0%
    128 x 128         133161.4   15.8       0.0%        258374.3    8.1       0.0%

With threading turned on (TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128):

    Matrix        SGEMM cycles    MPC   vs. base    DGEMM cycles    MPC   vs. base
    48 x 48            10725.9   10.5      -0.2%         18134.9    6.2      -0.8%
    64 x 64            20500.6   12.9       1.3%         40929.1    6.5      -0.7%
    65 x 65          2040832.1    0.1   -7495.2%       2097633.6    0.1   -3892.0%
    80 x 80          2063129.1    0.2   -5314.4%       2119925.2    0.2   -2824.3%
    96 x 96          2070374.5    0.4   -3259.6%       2173604.4    0.4   -1806.9%
    112 x 112        2111721.5    0.7   -2169.6%       2263330.8    0.6   -1170.0%
    128 x 128        2276181.5    0.9   -1609.3%       2377228.9    0.9    -820.1%

There is a deep, deep cliff once you hit 65x65. With this patch:

    Matrix        SGEMM cycles    MPC   vs. base    DGEMM cycles    MPC   vs. base
    48 x 48            10630.0   10.6       0.7%         18112.8    6.2      -0.7%
    64 x 64            20374.8   13.0       1.9%         40487.0    6.5       0.4%
    65 x 65           141955.2    1.9    -428.3%        146708.8    1.9    -179.2%
    80 x 80           178921.1    2.9    -369.6%        186032.7    2.8    -156.6%
    96 x 96           205436.2    4.3    -233.4%        224513.1    3.9     -97.0%
    112 x 112         244408.2    5.8    -162.7%        262158.7    5.4     -47.1%
    128 x 128         321334.5    6.5    -141.3%        333829.0    6.3     -29.2%

The cliff is very significantly reduced. (more to follow)
7 years ago
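A minimal sketch of the idea (the "after" loop matches the patched gemm_driver below; the "before" shape is inferred from the commit message, not copied from the old source): clear only the nthreads x nthreads synchronization flags this call will actually use, instead of all MAX_CPU_NUMBER compile-time slots.

    /* Before (inferred from the commit message): O(MAX_CPU_NUMBER^2) writes
     * on every call, regardless of how many threads actually run:
     *
     *   for (i = 0; i < MAX_CPU_NUMBER; i++)
     *     for (j = 0; j < MAX_CPU_NUMBER; j++)
     *       for (k = 0; k < DIVIDE_RATE; k++)
     *         job[i].working[j][CACHE_LINE_SIZE * k] = 0;
     */

    /* After: only the slots touched by this call's nthreads workers */
    for (i = 0; i < nthreads; i++)
      for (j = 0; j < nthreads; j++)
        for (k = 0; k < DIVIDE_RATE; k++)
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;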
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* Copyright 2023, 2025 The OpenBLAS Project.                        */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif

// The array of job_t may overflow the stack.
// Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
#elif defined(NT)
#define GEMM_LOCAL GEMM_NT
#elif defined(NR)
#define GEMM_LOCAL GEMM_NR
#elif defined(NC)
#define GEMM_LOCAL GEMM_NC
#elif defined(TN)
#define GEMM_LOCAL GEMM_TN
#elif defined(TT)
#define GEMM_LOCAL GEMM_TT
#elif defined(TR)
#define GEMM_LOCAL GEMM_TR
#elif defined(TC)
#define GEMM_LOCAL GEMM_TC
#elif defined(RN)
#define GEMM_LOCAL GEMM_RN
#elif defined(RT)
#define GEMM_LOCAL GEMM_RT
#elif defined(RR)
#define GEMM_LOCAL GEMM_RR
#elif defined(RC)
#define GEMM_LOCAL GEMM_RC
#elif defined(CN)
#define GEMM_LOCAL GEMM_CN
#elif defined(CT)
#define GEMM_LOCAL GEMM_CT
#elif defined(CR)
#define GEMM_LOCAL GEMM_CR
#elif defined(CC)
#define GEMM_LOCAL GEMM_CC
#endif
#endif
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
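/* Note: the flags are strided by CACHE_LINE_SIZE (in BLASLONG units) so that
 * entries polled by different threads fall on separate cache lines, limiting
 * false sharing in the busy-wait loops below. */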
#ifndef BETA_OPERATION
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
        GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
                  BETA[0], NULL, 0, NULL, 0, \
                  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
        GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
                  BETA[0], BETA[1], NULL, 0, NULL, 0, \
                  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#endif

#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef KERNEL_FUNC
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KERNEL_FUNC GEMM_KERNEL_N
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define KERNEL_FUNC GEMM_KERNEL_L
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define KERNEL_FUNC GEMM_KERNEL_R
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define KERNEL_FUNC GEMM_KERNEL_B
#endif
#endif

#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
        KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
        KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#endif
#endif

#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
        FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \
                            (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
        FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                            (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
        FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \
                            (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
        FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                            (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#endif
#endif

#ifndef A
#define A args -> a
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef B
#define B args -> b
#endif
#ifndef LDB
#define LDB args -> ldb
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDC
#define LDC args -> ldc
#endif
#ifndef M
#define M args -> m
#endif
#ifndef N
#define N args -> n
#endif
#ifndef K
#define K args -> k
#endif

#ifdef TIMING
#define START_RPCC() rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif

#if defined(BUILD_BFLOAT16)
#if defined(DYNAMIC_ARCH)
#if defined(BGEMM)
#define BFLOAT16_ALIGN_K gotoblas->bgemm_align_k
#else
#define BFLOAT16_ALIGN_K gotoblas->sbgemm_align_k
#endif
#else
#if defined(BGEMM)
#define BFLOAT16_ALIGN_K BGEMM_ALIGN_K
#else
#define BFLOAT16_ALIGN_K SBGEMM_ALIGN_K
#endif
#endif
#endif
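/* Per-thread body of the threaded GEMM: each thread copies its panel of A,
 * packs its share of B into the shared workspace, publishes it through the
 * job[].working flags, and consumes the B panels packed by its peers. */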
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  IFLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  IFLOAT *a, *b;
  FLOAT *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
  BLASLONG i, current;
  BLASLONG l1stride;

#ifdef TIMING
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (IFLOAT *)A;
  b = (IFLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
    nthreads_m = range_m[-1];
  }
  mypos_n = blas_quickdivide(mypos, nthreads_m);  /* mypos_n = mypos / nthreads_m */
  mypos_m = mypos - mypos_n * nthreads_m;         /* mypos_m = mypos % nthreads_m */

  /* Initialize m and n */
  m_from = 0;
  m_to   = M;
  if (range_m) {
    m_from = range_m[mypos_m + 0];
    m_to   = range_m[mypos_m + 1];
  }
  n_from = 0;
  n_to   = N;
  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];
  }

  /* Multiply C by beta if needed */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc);
  }

  /* Return early if no more computation is needed */
  if ((k == 0) || (alpha == NULL)) return 0;
  if (alpha[0] == ZERO
#ifdef COMPLEX
      && alpha[1] == ZERO
#endif
      ) return 0;

  /* Initialize workspace for local region of B */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }
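  /* Each thread's share of B is split into DIVIDE_RATE sub-buffers; a
   * sub-buffer is published to peer threads (via job[].working) as soon as
   * it has been packed, so consumers need not wait for the whole panel. */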
  /* Iterate through steps of k */
  for(ls = 0; ls < k; ls += min_l){

    /* Determine step size in k */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    BLASLONG pad_min_l = min_l;
#if defined(BFLOAT16)
    pad_min_l = (min_l + BFLOAT16_ALIGN_K - 1) & ~(BFLOAT16_ALIGN_K - 1);
#endif

    /* Determine step size in m
     * Note: We are currently on the first step in m
     */
    l1stride = 1;
    min_i = m_to - m_from;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
      } else {
        if (args -> nthreads == 1) l1stride = 0;
      }
    }

    /* Copy local region of A into workspace */
    START_RPCC();
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

      /* Make sure no other thread is still using this workspace slot */
      START_RPCC();
      for (i = 0; i < args -> nthreads; i++)
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      STOP_RPCC(waiting1);
      MB;

#if defined(FUSED_GEMM) && !defined(TIMING)
      /* Fused operation to copy region of B into workspace and apply kernel */
      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha,
                             sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls);
#else
      /* Split local region of B into parts */
      for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
        /* the current AVX512 s/d/c/z GEMM kernel requires n >= 6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
/*
        if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
        else
*/
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

        /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
                        buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
        STOP_RPCC(copy_B);

        /* Apply kernel with local region of A and part of local region of B */
        START_RPCC();
        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride,
                         c, ldc, m_from, jjs);
        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif
      }
#endif

      WMB;
      /* Set flag so other threads can access local region of B */
      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
    }

    /* Get regions of B from other threads and apply kernel */
    current = mypos;
    do {

      /* This thread accesses regions of B from threads in the range
       * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
      current ++;
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

          /* Wait until other region of B is initialized */
          START_RPCC();
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
          STOP_RPCC(waiting2);
          MB;

          /* Apply kernel with local region of A and part of other region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
        }

        /* Clear synchronization flag if this thread is done with other region of B */
        if (m_to - m_from == min_i) {
          WMB;
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
        }
      }
    } while (current != mypos);

    /* Iterate through steps of m
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }

      /* Copy local region of A into workspace */
      START_RPCC();
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      STOP_RPCC(copy_A);

      /* Get regions of B and apply kernel */
      current = mypos;
      do {

        /* Split region of B into parts and apply kernel */
        div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
        for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
            WMB;
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          }
        }

        /* This thread accesses regions of B from threads in the range
         * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
        current ++;
        if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;
      } while (current != mypos);
    }
  }

  /* Wait until all other threads are done with local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < DIVIDE_RATE; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js]) {YIELDING;};
    }
  }
  STOP_RPCC(waiting3);
  MB;

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
          mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
          (double)waiting1 /(double)total * 100.,
          (double)waiting2 /(double)total * 100.,
          (double)waiting3 /(double)total * 100.,
          (double)ops/(double)kernel / 4. * 100.);
  fprintf(stderr, "\n");
#endif

  return 0;
}
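/* Round width up to a multiple of 'multiple', unless fewer than 'multiple'
 * rows/columns remain or width is already <= multiple.
 * E.g. round_up(100, 33, 16) == 48, but round_up(10, 33, 16) == 33. */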
static int round_up(int remainder, int width, int multiple)
{
  if (multiple > remainder || width <= multiple)
    return width;
  width = (width + multiple - 1) / multiple;
  width = width * multiple;
  return width;
}

static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                       IFLOAT *sa, IFLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {
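  /* Throttle concurrent level-3 invocations: OpenMP builds count active
   * parallel sections against MAX_PARALLEL_NUMBER, Windows builds serialize
   * through a critical section, and pthread builds track CPU_AVAILABLE under
   * a mutex/condition-variable pair. */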
#ifdef USE_OPENMP
  static omp_lock_t level3_lock, critical_section_lock;
  static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0,
                  parallel_section_left = MAX_PARALLEL_NUMBER;

  // Lock initialization; TODO: maybe this part can be moved to blas_init() in blas_server_omp.c
  while(omp_lock_initialized == 0)
  {
    blas_lock(&init_lock);
    {
      if(omp_lock_initialized == 0)
      {
        omp_init_lock(&level3_lock);
        omp_init_lock(&critical_section_lock);
        omp_lock_initialized = 1;
        WMB;
      }
      blas_unlock(&init_lock);
    }
  }
#elif defined(OS_WINDOWS)
  CRITICAL_SECTION level3_lock;
  InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t level3_wakeup = PTHREAD_COND_INITIALIZER;
  volatile static BLASLONG CPU_AVAILABLE = MAX_CPU_NUMBER;
#endif

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t job[MAX_CPU_NUMBER];
#else
  job_t *job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG *range_M, *range_N;
  BLASLONG num_parts;
  BLASLONG nthreads = args -> nthreads;
  BLASLONG width, width_n, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int mode;
#if defined(DYNAMIC_ARCH)
  int switch_ratio = gotoblas->switch_ratio;
#else
  int switch_ratio = SWITCH_RATIO;
#endif

  /* Get execution mode */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif
#else
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE;
#endif
#endif

#ifdef USE_OPENMP
  omp_set_lock(&level3_lock);
  omp_set_lock(&critical_section_lock);

  parallel_section_left--;

  /*
   How the OpenMP locks work with NUM_PARALLEL:
   1) parallel_section_left = (number of concurrent OpenBLAS calls allowed) - (number currently executing)
   2) level3_lock acts as a master lock or barrier: it stops further OpenBLAS calls once all parallel sections are busy executing other OpenBLAS calls
   3) critical_section_lock protects the variables shared between concurrently executing OpenBLAS calls and guards the unlocking of the master lock
   4) The master lock is released only while parallel sections remain available, allowing another thread with an OpenBLAS call to enter
  */
  if(parallel_section_left != 0)
    omp_unset_lock(&level3_lock);

  omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
  EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  pthread_mutex_lock(&level3_lock);
  while(CPU_AVAILABLE < nthreads) {
    pthread_cond_wait(&level3_wakeup, &level3_lock);
  }
  CPU_AVAILABLE -= nthreads;
  WMB;
  pthread_mutex_unlock(&level3_lock);
#endif

#ifdef USE_ALLOC_HEAP
  /* Dynamically allocate workspace */
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Initialize struct for arguments */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common   = (void *)job;
#ifdef PARAMTEST
  newarg.gemm_p = args -> gemm_p;
  newarg.gemm_q = args -> gemm_q;
  newarg.gemm_r = args -> gemm_r;
#endif

  /* Initialize partitions in m and n
   * Note: The number of CPU partitions is stored in the -1 entry */
  range_M = &range_M_buffer[1];
  range_N = &range_N_buffer[1];
  range_M[-1] = nthreads_m;
  range_N[-1] = nthreads_n;

  if (!range_m) {
    range_M[0] = 0;
    m          = args -> m;
  } else {
    range_M[0] = range_m[0];
    m          = range_m[1] - range_m[0];
  }

  /* Partition m into nthreads_m regions */
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
    width = round_up(m, width, GEMM_PREFERED_SIZE);

    m -= width;
    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;
    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
    range_M[i + 1] = range_M[num_parts];
  }

  /* Initialize parameters for parallel execution */
  for (i = 0; i < nthreads; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = range_M;
    queue[i].range_n = range_N;
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  queue[0].sa = sa;
  queue[0].sb = sb;
  queue[nthreads - 1].next = NULL;

  /* Iterate through steps of n */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }
  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;

    /* Partition (a step of) n into nthreads regions */
    range_N[0] = js;
    num_parts  = 0;
    for(j = 0; j < nthreads_n; j++){
      width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j);
      n -= width_n;
      for(i = 0; i < nthreads_m; i++){
        width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i);
        if (width < switch_ratio) {
          width = switch_ratio;
        }
        width = round_up(width_n, width, GEMM_PREFERED_SIZE);

        width_n -= width;
        if (width_n < 0) {
          width   = width + width_n;
          width_n = 0;
        }
        range_N[num_parts + 1] = range_N[num_parts] + width;
        num_parts ++;
      }
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
      range_N[j + 1] = range_N[num_parts];
    }

    /* Clear synchronization flags */
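    /* Only the nthreads x nthreads flags used by this call are cleared;
     * clearing all MAX_CPU_NUMBER slots made this loop O(compiled CPUs^2)
     * per call and caused the small-matrix cliff described in the commit
     * message above. */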
    for (i = 0; i < nthreads; i++) {
      for (j = 0; j < nthreads; j++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }
    WMB;

    /* Execute parallel computation */
    exec_blas(nthreads, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

#ifdef USE_OPENMP
  omp_set_lock(&critical_section_lock);
  parallel_section_left++;

  /*
   The master lock is only taken once all parallel sections have been
   exhausted, so unlock it only when this call releases the last-taken
   section (parallel_section_left returning to 1); otherwise just
   increment parallel_section_left.
  */
  if(parallel_section_left == 1)
    omp_unset_lock(&level3_lock);

  omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  pthread_mutex_lock(&level3_lock);
  CPU_AVAILABLE += nthreads;
  WMB;
  pthread_cond_signal(&level3_wakeup);
  pthread_mutex_unlock(&level3_lock);
#endif

  return 0;
}
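/* Entry point: chooses an nthreads_m x nthreads_n thread grid for the given
 * problem size (or falls back to the serial GEMM_LOCAL) and dispatches to
 * gemm_driver. */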
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG nthreads_m, nthreads_n;
#if defined(DYNAMIC_ARCH)
  int switch_ratio = gotoblas->switch_ratio;
#else
  int switch_ratio = SWITCH_RATIO;
#endif

  /* Get dimensions from index ranges if available */
  if (range_m) {
    m = range_m[1] - range_m[0];
  }
  if (range_n) {
    n = range_n[1] - range_n[0];
  }

  /* Partitions in m should have at least switch_ratio rows */
  if (m < 2 * switch_ratio) {
    nthreads_m = 1;
  } else {
    nthreads_m = args -> nthreads;
    while (m < nthreads_m * switch_ratio) {
      nthreads_m = nthreads_m / 2;
    }
  }

  /* Partitions in n should have at most switch_ratio * nthreads_m columns */
  if (n < switch_ratio * nthreads_m) {
    nthreads_n = 1;
  } else {
    nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }

    /* nthreads_m and nthreads_n are adjusted so that the submatrix handled  */
    /* by each thread preferably becomes close to square, by minimizing the  */
    /* objective function 'n * nthreads_m + m * nthreads_n'.                 */
    /* The objective function comes from the sum of the partition sizes in   */
    /* m and n:                                                              */
    /*   (n / nthreads_n) + (m / nthreads_m)                                 */
    /*   = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m)     */
    BLASLONG cost = 0, div = 0;
    BLASLONG i;
    for (i = 1; i <= sqrt(nthreads_m); i++) {
      if (nthreads_m % i) continue;
      BLASLONG j = nthreads_m / i;
      BLASLONG cost_i = n * j + m * nthreads_n * i;
      BLASLONG cost_j = n * i + m * nthreads_n * j;
      if (cost == 0 || cost_i < cost) {cost = cost_i; div = i;}
      if (cost_j < cost) {cost = cost_j; div = j;}
    }
    if (div > 1) {
      nthreads_m /= div;
      nthreads_n *= div;
    }
  }

  /* Execute serial or parallel computation */
  if (nthreads_m * nthreads_n <= 1) {
    GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
  } else {
    args -> nthreads = nthreads_m * nthreads_n;
    gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n);
  }

  return 0;
}