
level3_thread.c 28 kB

Only initialize the part of the jobs array that will get used

The jobs array is initialized with O(NUM_THREADS^2) complexity, where NUM_THREADS is the compiled-in maximum CPU count. Distros and people with bigger systems use pretty high values for it (128, 256 or more), leading to interesting bubbles in performance.

Baseline (single-threaded) performance is roughly 13 - 15 multiplications per cycle (MPC) in the interesting range; threading kicks in at a 65x65 times 65x65 multiply. The hardware is theoretically capable of 32 multiplications per cycle. Percentages below are the change relative to this baseline.

Matrix       SGEMM cycles    MPC   vs base   DGEMM cycles    MPC   vs base
48 x 48           10703.9   10.6      0.0%        17990.6    6.3      0.0%
64 x 64           20778.4   12.8      0.0%        40629.2    6.5      0.0%
65 x 65           26869.9   10.3      0.0%        52545.7    5.3      0.0%
80 x 80           38104.5   13.5      0.0%        72492.7    7.1      0.0%
96 x 96           61626.4   14.4      0.0%       113983.8    7.8      0.0%
112 x 112         91803.8   15.3      0.0%       180987.3    7.8      0.0%
128 x 128        133161.4   15.8      0.0%       258374.3    8.1      0.0%

When threading is turned on (TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128):

Matrix       SGEMM cycles    MPC   vs base   DGEMM cycles    MPC   vs base
48 x 48           10725.9   10.5     -0.2%        18134.9    6.2     -0.8%
64 x 64           20500.6   12.9      1.3%        40929.1    6.5     -0.7%
65 x 65         2040832.1    0.1  -7495.2%      2097633.6    0.1  -3892.0%
80 x 80         2063129.1    0.2  -5314.4%      2119925.2    0.2  -2824.3%
96 x 96         2070374.5    0.4  -3259.6%      2173604.4    0.4  -1806.9%
112 x 112       2111721.5    0.7  -2169.6%      2263330.8    0.6  -1170.0%
128 x 128       2276181.5    0.9  -1609.3%      2377228.9    0.9   -820.1%

There is a deep, deep cliff once you hit 65x65.

With this patch:

Matrix       SGEMM cycles    MPC   vs base   DGEMM cycles    MPC   vs base
48 x 48           10630.0   10.6      0.7%        18112.8    6.2     -0.7%
64 x 64           20374.8   13.0      1.9%        40487.0    6.5      0.4%
65 x 65          141955.2    1.9   -428.3%       146708.8    1.9   -179.2%
80 x 80          178921.1    2.9   -369.6%       186032.7    2.8   -156.6%
96 x 96          205436.2    4.3   -233.4%       224513.1    3.9    -97.0%
112 x 112        244408.2    5.8   -162.7%       262158.7    5.4    -47.1%
128 x 128        321334.5    6.5   -141.3%       333829.0    6.3    -29.2%

The cliff is very significantly reduced. (more to follow)
7 years ago
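The gist of the change, as a minimal sketch (the "after" loop is the one now in gemm_driver below; the "before" bounds are reconstructed from the commit description rather than quoted from the old source):

    /* Before (reconstructed): clears the full compile-time job matrix on
       every GEMM call, O(MAX_CPU_NUMBER^2) writes even for small runs. */
    for (i = 0; i < MAX_CPU_NUMBER; i++)
      for (j = 0; j < MAX_CPU_NUMBER; j++)
        for (k = 0; k < DIVIDE_RATE; k++)
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;

    /* After: clears only the flags that inner_thread() will actually poll,
       O(nthreads^2) writes for the threads taking part in this call. */
    for (i = 0; i < nthreads; i++)
      for (j = 0; j < nthreads; j++)
        for (k = 0; k < DIVIDE_RATE; k++)
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;

With NUM_THREADS=128 and DIVIDE_RATE=2 that is 32768 flag writes per outer iteration before the patch, versus e.g. 32 when only 4 threads take part.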
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* Copyright 2023, 2025 The OpenBLAS Project.                        */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
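
/* Added note: the two-letter configuration macros below (NN, NT, ..., CC)
 * encode the forms of A and B respectively, following the usual OpenBLAS
 * convention: N = no transpose, T = transpose, R = conjugate (no transpose),
 * C = conjugate transpose. */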
#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
#elif defined(NT)
#define GEMM_LOCAL GEMM_NT
#elif defined(NR)
#define GEMM_LOCAL GEMM_NR
#elif defined(NC)
#define GEMM_LOCAL GEMM_NC
#elif defined(TN)
#define GEMM_LOCAL GEMM_TN
#elif defined(TT)
#define GEMM_LOCAL GEMM_TT
#elif defined(TR)
#define GEMM_LOCAL GEMM_TR
#elif defined(TC)
#define GEMM_LOCAL GEMM_TC
#elif defined(RN)
#define GEMM_LOCAL GEMM_RN
#elif defined(RT)
#define GEMM_LOCAL GEMM_RT
#elif defined(RR)
#define GEMM_LOCAL GEMM_RR
#elif defined(RC)
#define GEMM_LOCAL GEMM_RC
#elif defined(CN)
#define GEMM_LOCAL GEMM_CN
#elif defined(CT)
#define GEMM_LOCAL GEMM_CT
#elif defined(CR)
#define GEMM_LOCAL GEMM_CR
#elif defined(CC)
#define GEMM_LOCAL GEMM_CC
#endif
#endif
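
/* Added commentary: job[i].working[j][CACHE_LINE_SIZE * b] is the flag that
 * thread j polls to see whether buffer b of thread i's packed B panel is
 * ready; a nonzero value is the address of that buffer, and the consumer
 * clears it when done. Flags are spaced CACHE_LINE_SIZE BLASLONGs apart so
 * that different threads' updates do not share a cache line. */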
typedef struct {
  volatile
  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

#ifndef BETA_OPERATION
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
            BETA[0], BETA[1], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#endif

#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef KERNEL_FUNC
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KERNEL_FUNC GEMM_KERNEL_N
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define KERNEL_FUNC GEMM_KERNEL_L
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define KERNEL_FUNC GEMM_KERNEL_R
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define KERNEL_FUNC GEMM_KERNEL_B
#endif
#endif

#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#endif
#endif

#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \
                      (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                      (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \
                      (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
                      (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#endif
#endif

#ifndef A
#define A args -> a
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef B
#define B args -> b
#endif
#ifndef LDB
#define LDB args -> ldb
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDC
#define LDC args -> ldc
#endif
#ifndef M
#define M args -> m
#endif
#ifndef N
#define N args -> n
#endif
#ifndef K
#define K args -> k
#endif

#ifdef TIMING
#define START_RPCC() rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif

#if defined(BUILD_BFLOAT16)
#if defined(DYNAMIC_ARCH)
#if defined(BGEMM)
#define BFLOAT16_ALIGN_K gotoblas->bgemm_align_k
#else
#define BFLOAT16_ALIGN_K gotoblas->sbgemm_align_k
#endif
#else
#if defined(BGEMM)
#define BFLOAT16_ALIGN_K BGEMM_ALIGN_K
#else
#define BFLOAT16_ALIGN_K SBGEMM_ALIGN_K
#endif
#endif
#endif
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  IFLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  IFLOAT *a, *b;
  FLOAT *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;
  BLASLONG divide_rate = DIVIDE_RATE;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
  BLASLONG i, current;
  BLASLONG l1stride;

#ifdef TIMING
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (IFLOAT *)A;
  b = (IFLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Disable divide_rate when N for every thread is less than DIVIDE_LIMIT */
#ifdef DIVIDE_LIMIT
  if (N < DIVIDE_LIMIT) divide_rate = 1;
#endif

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
    nthreads_m = range_m[-1];
  }
  mypos_n = blas_quickdivide(mypos, nthreads_m);  /* mypos_n = mypos / nthreads_m */
  mypos_m = mypos - mypos_n * nthreads_m;         /* mypos_m = mypos % nthreads_m */

  /* Initialize m and n */
  m_from = 0;
  m_to   = M;
  if (range_m) {
    m_from = range_m[mypos_m + 0];
    m_to   = range_m[mypos_m + 1];
  }
  n_from = 0;
  n_to   = N;
  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];
  }

  /* Multiply C by beta if needed */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc);
  }

  /* Return early if no more computation is needed */
  if ((k == 0) || (alpha == NULL)) return 0;
  if (alpha[0] == ZERO
#ifdef COMPLEX
      && alpha[1] == ZERO
#endif
      ) return 0;

  /* Initialize workspace for the local region of B */
  div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
  buffer[0] = sb;
  for (i = 1; i < divide_rate; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }

  /* Iterate through steps of k */
  for(ls = 0; ls < k; ls += min_l){

    /* Determine step size in k */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    BLASLONG pad_min_l = min_l;
#if defined(BFLOAT16)
    pad_min_l = (min_l + BFLOAT16_ALIGN_K - 1) & ~(BFLOAT16_ALIGN_K - 1);
#endif

    /* Determine step size in m
     * Note: We are currently on the first step in m */
    l1stride = 1;
    min_i = m_to - m_from;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
      } else {
        if (args -> nthreads == 1) l1stride = 0;
      }
    }

    /* Copy local region of A into workspace */
    START_RPCC();
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

      /* Make sure no other thread is still using this workspace buffer */
      START_RPCC();
      for (i = 0; i < args -> nthreads; i++)
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      STOP_RPCC(waiting1);
      MB;

#if defined(FUSED_GEMM) && !defined(TIMING)
      /* Fused operation to copy region of B into workspace and apply kernel */
      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha,
                             sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls);
#else
      /* Split local region of B into parts */
      for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
        /* the current AVX512 s/d/c/z GEMM kernel requires n >= 6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
/*
        if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
        else
*/
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
        /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
                        buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
        STOP_RPCC(copy_B);

        /* Apply kernel with local region of A and part of local region of B */
        START_RPCC();
        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride,
                         c, ldc, m_from, jjs);
        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif
      }
#endif

      WMB;
      /* Set flag so other threads can access local region of B */
      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
    }

    /* Get regions of B from other threads and apply kernel */
    current = mypos;
    do {

      /* This thread accesses regions of B from threads in the range
       * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
      current ++;
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

          /* Wait until other region of B is initialized */
          START_RPCC();
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
          STOP_RPCC(waiting2);
          MB;

          /* Apply kernel with local region of A and part of other region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, js);
          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
        }

        /* Clear synchronization flag if this thread is done with other region of B */
        if (m_to - m_from == min_i) {
          WMB;
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
        }
      }
    } while (current != mypos);

    /* Iterate through steps of m
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }

      /* Copy local region of A into workspace */
      START_RPCC();
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      STOP_RPCC(copy_A);

      /* Get regions of B and apply kernel */
      current = mypos;
      do {

        /* Split region of B into parts and apply kernel */
        div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
        for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, js);
          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
            WMB;
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          }
        }

        /* This thread accesses regions of B from threads in the range
         * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
        current ++;
        if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;
      } while (current != mypos);
    }
  }

  /* Wait until all other threads are done with the local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < divide_rate; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js]) {YIELDING;};
    }
  }
  STOP_RPCC(waiting3);
  MB;

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
          mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
          (double)waiting1 /(double)total * 100.,
          (double)waiting2 /(double)total * 100.,
          (double)waiting3 /(double)total * 100.,
          (double)ops/(double)kernel / 4. * 100.);
  fprintf(stderr, "\n");
#endif

  return 0;
}
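
/* Added commentary: round_up() rounds 'width' up to the next multiple of
 * 'multiple' (e.g. round_up(100, 37, 16) == 48), except when fewer than
 * 'multiple' rows/columns remain or 'width' is already no larger than
 * 'multiple', in which case 'width' is returned unchanged. Callers clip
 * any overshoot against the remaining size afterwards. */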
static int round_up(int remainder, int width, int multiple)
{
  if (multiple > remainder || width <= multiple)
    return width;
  width = (width + multiple - 1) / multiple;
  width = width * multiple;
  return width;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
                       *range_n, IFLOAT *sa, IFLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {

#ifdef USE_OPENMP
  static omp_lock_t level3_lock, critical_section_lock;
  static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0,
                            parallel_section_left = MAX_PARALLEL_NUMBER;

  // Lock initialization; TODO: this part could perhaps move to blas_init() in blas_server_omp.c
  while(omp_lock_initialized == 0)
  {
    blas_lock(&init_lock);
    {
      if(omp_lock_initialized == 0)
      {
        omp_init_lock(&level3_lock);
        omp_init_lock(&critical_section_lock);
        omp_lock_initialized = 1;
        WMB;
      }
      blas_unlock(&init_lock);
    }
  }
#elif defined(OS_WINDOWS)
  CRITICAL_SECTION level3_lock;
  InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t level3_wakeup = PTHREAD_COND_INITIALIZER;
  volatile static BLASLONG CPU_AVAILABLE = MAX_CPU_NUMBER;
#endif

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t job[MAX_CPU_NUMBER];
#else
  job_t *job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG *range_M, *range_N;
  BLASLONG num_parts;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, width_n, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int mode;

#if defined(DYNAMIC_ARCH)
  int switch_ratio = gotoblas->switch_ratio;
#else
  int switch_ratio = SWITCH_RATIO;
#endif

  /* Get execution mode */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif
#else
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE;
#else
  mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE;
#endif
#endif

#ifdef USE_OPENMP
  omp_set_lock(&level3_lock);
  omp_set_lock(&critical_section_lock);

  parallel_section_left--;

  /*
    How the OpenMP locks work with NUM_PARALLEL:
    1) parallel_section_left = number of concurrent OpenBLAS executions allowed
       minus the number of OpenBLAS calls currently executing
    2) level3_lock acts as a master lock or barrier that stops further OpenBLAS
       calls while all parallel sections are busy executing other OpenBLAS calls
    3) critical_section_lock protects the variables shared between concurrently
       executing OpenBLAS calls and the unlocking of the master lock
    4) Unlock the master lock only when the parallel sections are not yet
       exhausted, allowing another thread with an OpenBLAS call to enter
  */
  if(parallel_section_left != 0)
    omp_unset_lock(&level3_lock);

  omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
  EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  pthread_mutex_lock(&level3_lock);
  while(CPU_AVAILABLE < nthreads) {
    pthread_cond_wait(&level3_wakeup, &level3_lock);
  }
  CPU_AVAILABLE -= nthreads;
  WMB;
  pthread_mutex_unlock(&level3_lock);
#endif

#ifdef USE_ALLOC_HEAP
  /* Dynamically allocate workspace */
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Initialize struct for arguments */
  newarg.m = args -> m;
  newarg.n = args -> n;
  newarg.k = args -> k;
  newarg.a = args -> a;
  newarg.b = args -> b;
  newarg.c = args -> c;
  newarg.lda = args -> lda;
  newarg.ldb = args -> ldb;
  newarg.ldc = args -> ldc;
  newarg.alpha = args -> alpha;
  newarg.beta = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common = (void *)job;
#ifdef PARAMTEST
  newarg.gemm_p = args -> gemm_p;
  newarg.gemm_q = args -> gemm_q;
  newarg.gemm_r = args -> gemm_r;
#endif

  /* Initialize partitions in m and n
   * Note: The number of CPU partitions is stored in the -1 entry */
  range_M = &range_M_buffer[1];
  range_N = &range_N_buffer[1];
  range_M[-1] = nthreads_m;
  range_N[-1] = nthreads_n;
  if (!range_m) {
    range_M[0] = 0;
    m = args -> m;
  } else {
    range_M[0] = range_m[0];
    m = range_m[1] - range_m[0];
  }

  /* Partition m into nthreads_m regions */
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);

    width = round_up(m, width, GEMM_PREFERED_SIZE);

    m -= width;
    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;
    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
    range_M[i + 1] = range_M[num_parts];
  }

  /* Initialize parameters for parallel execution */
  for (i = 0; i < nthreads; i++) {
    queue[i].mode = mode;
    queue[i].routine = inner_thread;
    queue[i].args = &newarg;
    queue[i].range_m = range_M;
    queue[i].range_n = range_N;
    queue[i].sa = NULL;
    queue[i].sb = NULL;
    queue[i].next = &queue[i + 1];
  }
  queue[0].sa = sa;
  queue[0].sb = sb;
  queue[nthreads - 1].next = NULL;

  /* Iterate through steps of n */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }
  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;

    /* Partition (a step of) n into nthreads regions */
    range_N[0] = js;
    num_parts = 0;
    for(j = 0; j < nthreads_n; j++){
      width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j);
      n -= width_n;
      for(i = 0; i < nthreads_m; i++){
        width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i);
        if (width < switch_ratio) {
          width = switch_ratio;
        }
        width = round_up(width_n, width, GEMM_PREFERED_SIZE);

        width_n -= width;
        if (width_n < 0) {
          width = width + width_n;
          width_n = 0;
        }
        range_N[num_parts + 1] = range_N[num_parts] + width;

        num_parts ++;
      }
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
      range_N[j + 1] = range_N[num_parts];
    }
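
    /* Added note: only the first nthreads entries are cleared here;
     * initializing all MAX_CPU_NUMBER x MAX_CPU_NUMBER flags made this
     * O(NUM_THREADS^2) and caused the startup cliff described in the
     * commit message above. */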
    /* Clear synchronization flags */
    for (i = 0; i < nthreads; i++) {
      for (j = 0; j < nthreads; j++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }
    WMB;

    /* Execute parallel computation */
    exec_blas(nthreads, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

#ifdef USE_OPENMP
  omp_set_lock(&critical_section_lock);
  parallel_section_left++;

  /*
    The master lock is locked only once all the parallel sections have been
    exhausted, so unlock it only in that case, i.e. when a thread completing
    its OpenBLAS call brings parallel_section_left back from 0 to 1;
    otherwise just increment the count.
  */
  if(parallel_section_left == 1)
    omp_unset_lock(&level3_lock);

  omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
  pthread_mutex_lock(&level3_lock);
  CPU_AVAILABLE += nthreads;
  WMB;
  pthread_cond_signal(&level3_wakeup);
  pthread_mutex_unlock(&level3_lock);
#endif

  return 0;
}
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG nthreads_m, nthreads_n;

#if defined(DYNAMIC_ARCH)
  int switch_ratio = gotoblas->switch_ratio;
#else
  int switch_ratio = SWITCH_RATIO;
#endif

  /* Get dimensions from index ranges if available */
  if (range_m) {
    m = range_m[1] - range_m[0];
  }
  if (range_n) {
    n = range_n[1] - range_n[0];
  }

  /* Partitions in m should have at least switch_ratio rows */
  if (m < 2 * switch_ratio) {
    nthreads_m = 1;
  } else {
    nthreads_m = args -> nthreads;
    while (m < nthreads_m * switch_ratio) {
      nthreads_m = nthreads_m / 2;
    }
  }

  /* Partitions in n should have at most switch_ratio * nthreads_m columns */
  if (n < switch_ratio * nthreads_m) {
    nthreads_n = 1;
  } else {
    nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }

    /* nthreads_m and nthreads_n are adjusted so that the submatrix handled   */
    /* by each thread preferably becomes square, by minimizing the objective  */
    /* function 'n * nthreads_m + m * nthreads_n'. The objective function     */
    /* comes from the sum of the partition sizes in m and n:                  */
    /*   (n / nthreads_n) + (m / nthreads_m)                                  */
    /*   = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m)      */
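    /* Illustrative walk-through (added commentary, not in the original      */
    /* source): for m = n = 1024, args->nthreads = 16, switch_ratio = 16,    */
    /* the code above first picks nthreads_m = 16, nthreads_n = 1; the scan  */
    /* below then finds div = 4 (cost 1024*4 + 1024*4 = 8192 beats div = 1   */
    /* or 2), yielding a square 4 x 4 thread grid.                           */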
    BLASLONG cost = 0, div = 0;
    BLASLONG i;
    for (i = 1; i <= sqrt(nthreads_m); i++) {
      if (nthreads_m % i) continue;
      BLASLONG j = nthreads_m / i;
      BLASLONG cost_i = n * j + m * nthreads_n * i;
      BLASLONG cost_j = n * i + m * nthreads_n * j;
      if (cost == 0 ||
          cost_i < cost) {cost = cost_i; div = i;}
      if (cost_j < cost) {cost = cost_j; div = j;}
    }
    if (div > 1) {
      nthreads_m /= div;
      nthreads_n *= div;
    }
  }

  /* Execute serial or parallel computation */
  if (nthreads_m * nthreads_n <= 1) {
    GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
  } else {
    args -> nthreads = nthreads_m * nthreads_n;
    gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n);
  }

  return 0;
}