
level3_thread.c 22 kB

Only initialize the part of the jobs array that will get used

The jobs array is initialized with O(compiled CPUs^2) complexity, i.e. quadratic in the compile-time NUM_THREADS, not in the number of threads actually used. Distros and people with bigger systems use pretty high values (128, 256 or more) for this, leading to interesting bubbles in performance.

Baseline (single-threaded) performance is roughly 13 - 15 multiplications per cycle (MPC) in the interesting range; threading kicks in at a 65x65 times 65x65 multiply. The hardware is theoretically capable of 32 multiplications per cycle.

Matrix       SGEMM cycles    MPC               DGEMM cycles    MPC
48 x 48         10703.9     10.6      0.0%        17990.6      6.3      0.0%
64 x 64         20778.4     12.8      0.0%        40629.2      6.5      0.0%
65 x 65         26869.9     10.3      0.0%        52545.7      5.3      0.0%
80 x 80         38104.5     13.5      0.0%        72492.7      7.1      0.0%
96 x 96         61626.4     14.4      0.0%       113983.8      7.8      0.0%
112 x 112       91803.8     15.3      0.0%       180987.3      7.8      0.0%
128 x 128      133161.4     15.8      0.0%       258374.3      8.1      0.0%

With threading turned on (TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128):

Matrix       SGEMM cycles    MPC               DGEMM cycles    MPC
48 x 48         10725.9     10.5     -0.2%        18134.9      6.2     -0.8%
64 x 64         20500.6     12.9      1.3%        40929.1      6.5     -0.7%
65 x 65       2040832.1      0.1  -7495.2%      2097633.6      0.1  -3892.0%
80 x 80       2063129.1      0.2  -5314.4%      2119925.2      0.2  -2824.3%
96 x 96       2070374.5      0.4  -3259.6%      2173604.4      0.4  -1806.9%
112 x 112     2111721.5      0.7  -2169.6%      2263330.8      0.6  -1170.0%
128 x 128     2276181.5      0.9  -1609.3%      2377228.9      0.9   -820.1%

There is a deep, deep cliff once you hit 65x65.

With this patch:

Matrix       SGEMM cycles    MPC               DGEMM cycles    MPC
48 x 48         10630.0     10.6      0.7%        18112.8      6.2     -0.7%
64 x 64         20374.8     13.0      1.9%        40487.0      6.5      0.4%
65 x 65        141955.2      1.9   -428.3%       146708.8      1.9   -179.2%
80 x 80        178921.1      2.9   -369.6%       186032.7      2.8   -156.6%
96 x 96        205436.2      4.3   -233.4%       224513.1      3.9    -97.0%
112 x 112      244408.2      5.8   -162.7%       262158.7      5.4    -47.1%
128 x 128      321334.5      6.5   -141.3%       333829.0      6.3    -29.2%

The cliff is very significantly reduced. (more to follow)
7 years ago
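To make the fix concrete, here is a minimal sketch of the change the commit message describes. The "after" loop is the one that appears in gemm_driver below; the "before" bounds are an assumption reconstructed from the commit message, not a quote of the old code:

/* Before (assumed pre-patch form): clears the full compile-time-sized
 * flag matrix, O(MAX_CPU_NUMBER^2) stores per GEMM call even when only
 * a few threads participate. */
for (i = 0; i < MAX_CPU_NUMBER; i++)
  for (j = 0; j < MAX_CPU_NUMBER; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;

/* After (as in gemm_driver below): clears only the nthreads x nthreads
 * corner that this call will actually use. */
for (i = 0; i < nthreads; i++)
  for (j = 0; j < nthreads; j++)
    for (k = 0; k < DIVIDE_RATE; k++)
      job[i].working[j][CACHE_LINE_SIZE * k] = 0;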
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
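/* Editor's note on these tunables (inferred from how they are used
 * below): CACHE_LINE_SIZE is counted in BLASLONG elements, spacing
 * consecutive synchronization flags a cache line apart; DIVIDE_RATE is
 * the number of buffers each thread's panel of B is split into;
 * SWITCH_RATIO is the minimum worthwhile partition width when dividing
 * work across threads. */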
// The array of job_t may overflow the stack.
// Instead, use malloc to allocate job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
#ifndef GEMM_LOCAL
#if   defined(NN)
#define GEMM_LOCAL    GEMM_NN
#elif defined(NT)
#define GEMM_LOCAL    GEMM_NT
#elif defined(NR)
#define GEMM_LOCAL    GEMM_NR
#elif defined(NC)
#define GEMM_LOCAL    GEMM_NC
#elif defined(TN)
#define GEMM_LOCAL    GEMM_TN
#elif defined(TT)
#define GEMM_LOCAL    GEMM_TT
#elif defined(TR)
#define GEMM_LOCAL    GEMM_TR
#elif defined(TC)
#define GEMM_LOCAL    GEMM_TC
#elif defined(RN)
#define GEMM_LOCAL    GEMM_RN
#elif defined(RT)
#define GEMM_LOCAL    GEMM_RT
#elif defined(RR)
#define GEMM_LOCAL    GEMM_RR
#elif defined(RC)
#define GEMM_LOCAL    GEMM_RC
#elif defined(CN)
#define GEMM_LOCAL    GEMM_CN
#elif defined(CT)
#define GEMM_LOCAL    GEMM_CT
#elif defined(CR)
#define GEMM_LOCAL    GEMM_CR
#elif defined(CC)
#define GEMM_LOCAL    GEMM_CC
#endif
#endif
typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
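/* Editor's note: job[i].working[j][CACHE_LINE_SIZE * bufferside] holds
 * either 0 or a pointer to thread i's copied buffer of B, published to
 * thread j. The CACHE_LINE_SIZE stride keeps each flag on its own cache
 * line (8 BLASLONGs = 64 bytes on typical targets), so a thread spinning
 * on one flag does not contend with writes to the neighbouring ones. */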
#ifndef BETA_OPERATION
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
            BETA[0], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
            BETA[0], BETA[1], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#endif

#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef KERNEL_FUNC
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KERNEL_FUNC   GEMM_KERNEL_N
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define KERNEL_FUNC   GEMM_KERNEL_L
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define KERNEL_FUNC   GEMM_KERNEL_R
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define KERNEL_FUNC   GEMM_KERNEL_B
#endif
#endif

#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
#endif
#endif

#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC)
#endif
#endif
#endif

#ifndef A
#define A args -> a
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef B
#define B args -> b
#endif
#ifndef LDB
#define LDB args -> ldb
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDC
#define LDC args -> ldc
#endif
#ifndef M
#define M args -> m
#endif
#ifndef N
#define N args -> n
#endif
#ifndef K
#define K args -> k
#endif

#ifdef TIMING
#define START_RPCC() rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif

static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG nthreads_m;
  BLASLONG mypos_m, mypos_n;

  BLASLONG is, js, ls, bufferside, jjs;
  BLASLONG min_i, min_l, div_n, min_jj;
  BLASLONG i, current;
  BLASLONG l1stride;

#ifdef TIMING
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Initialize 2D CPU distribution */
  nthreads_m = args -> nthreads;
  if (range_m) {
    nthreads_m = range_m[-1];
  }
  mypos_n = blas_quickdivide(mypos, nthreads_m);  /* mypos_n = mypos / nthreads_m */
  mypos_m = mypos - mypos_n * nthreads_m;         /* mypos_m = mypos % nthreads_m */

  /* Initialize m and n */
  m_from = 0;
  m_to   = M;
  if (range_m) {
    m_from = range_m[mypos_m + 0];
    m_to   = range_m[mypos_m + 1];
  }
  n_from = 0;
  n_to   = N;
  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];
  }

  /* Multiply C by beta if needed */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc);
  }

  /* Return early if no more computation is needed */
  if ((k == 0) || (alpha == NULL)) return 0;
  if (alpha[0] == ZERO
#ifdef COMPLEX
      && alpha[1] == ZERO
#endif
      ) return 0;

  /* Initialize workspace for local region of B */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
  }

  /* Iterate through steps of k */
  for(ls = 0; ls < k; ls += min_l){

    /* Determine step size in k */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Determine step size in m
     * Note: We are currently on the first step in m
     */
    l1stride = 1;
    min_i = m_to - m_from;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
        min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
      } else {
        if (args -> nthreads == 1) l1stride = 0;
      }
    }

    /* Copy local region of A into workspace */
    START_RPCC();
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    STOP_RPCC(copy_A);

    /* Copy local region of B into workspace and apply kernel */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

#if defined(FUSED_GEMM) && !defined(TIMING)

      /* Fused operation to copy region of B into workspace and apply kernel */
      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha,
                             sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls);

#else

      /* Split local region of B into parts */
      for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
          if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
          else
            if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

        /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
                        buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride);
        STOP_RPCC(copy_B);

        /* Apply kernel with local region of A and part of local region of B */
        START_RPCC();
        KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
                         sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride,
                         c, ldc, m_from, jjs);
        STOP_RPCC(kernel);

#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }
#endif

      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
        /* Wait until no other thread is still reading this workspace buffer */
        START_RPCC();
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
        STOP_RPCC(waiting1);

        /* Set flag so other threads can access local region of B */
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
        WMB;
      }
    }

    /* Get regions of B from other threads and apply kernel */
    current = mypos;
    do {

      /* This thread accesses regions of B from threads in the range
       * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
      current ++;
      if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      /* Split other region of B into parts */
      div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
        if (current != mypos) {

          /* Wait until other region of B is initialized */
          START_RPCC();
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
          STOP_RPCC(waiting2);

          /* Apply kernel with local region of A and part of other region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
        }

        /* Clear synchronization flag if this thread is done with other region of B */
        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
          WMB;
        }
      }
    } while (current != mypos);

    /* Iterate through steps of m
     * Note: First step has already been finished */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM_P * 2) {
        min_i = GEMM_P;
      } else
        if (min_i > GEMM_P) {
          min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }

      /* Copy local region of A into workspace */
      START_RPCC();
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      STOP_RPCC(copy_A);

      /* Get regions of B and apply kernel */
      current = mypos;
      do {

        /* Split region of B into parts and apply kernel */
        div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
        for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

          /* Apply kernel with local region of A and part of region of B */
          START_RPCC();
          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, js);
          STOP_RPCC(kernel);

#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

          /* Clear synchronization flag if this thread is done with region of B */
          if (is + min_i >= m_to) {
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
            WMB;
          }
        }

        /* This thread accesses regions of B from threads in the range
         * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */
        current ++;
        if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

      } while (current != mypos);
    }
  }

  /* Wait until all other threads are done with local region of B */
  START_RPCC();
  for (i = 0; i < args -> nthreads; i++) {
    for (js = 0; js < DIVIDE_RATE; js++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
    }
  }
  STOP_RPCC(waiting3);

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
          mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
          (double)waiting1 /(double)total * 100.,
          (double)waiting2 /(double)total * 100.,
          (double)waiting3 /(double)total * 100.,
          (double)ops/(double)kernel / 4. * 100.);
  fprintf(stderr, "\n");
#endif

  return 0;
}

static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                       FLOAT *sa, FLOAT *sb,
                       BLASLONG nthreads_m, BLASLONG nthreads_n) {

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t  job[MAX_CPU_NUMBER];
#else
  job_t *job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2];
  BLASLONG *range_M, *range_N;
  BLASLONG num_parts;

  BLASLONG nthreads = args -> nthreads;
  BLASLONG width, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int mode;

  /* Get execution mode */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_REAL | BLAS_NODE;
#else
  mode = BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
#endif
#else
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_COMPLEX | BLAS_NODE;
#else
  mode = BLAS_SINGLE  | BLAS_COMPLEX | BLAS_NODE;
#endif
#endif

#ifdef USE_ALLOC_HEAP
  /* Dynamically allocate workspace */
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Initialize struct for arguments */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common   = (void *)job;

#ifdef PARAMTEST
  newarg.gemm_p = args -> gemm_p;
  newarg.gemm_q = args -> gemm_q;
  newarg.gemm_r = args -> gemm_r;
#endif

  /* Initialize partitions in m and n
   * Note: The number of CPU partitions is stored in the -1 entry */
  range_M = &range_M_buffer[1];
  range_N = &range_N_buffer[1];
  range_M[-1] = nthreads_m;
  range_N[-1] = nthreads_n;

  if (!range_m) {
    range_M[0] = 0;
    m          = args -> m;
  } else {
    range_M[0] = range_m[0];
    m          = range_m[1] - range_m[0];
  }

  /* Partition m into nthreads_m regions */
  num_parts = 0;
  while (m > 0){
    width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
    m -= width;
    if (m < 0) width = width + m;
    range_M[num_parts + 1] = range_M[num_parts] + width;
    num_parts ++;
  }
  for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
    range_M[i + 1] = range_M[num_parts];
  }

  /* Initialize parameters for parallel execution */
  for (i = 0; i < nthreads; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = range_M;
    queue[i].range_n = range_N;
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  queue[0].sa = sa;
  queue[0].sb = sb;
  queue[nthreads - 1].next = NULL;
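  /* Editor's note: only queue[0] reuses the caller's sa/sb workspace; the
   * other entries are left NULL, which lets each worker supply its own
   * per-thread buffer (behaviour assumed from the OpenBLAS thread server,
   * not shown in this file). */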
  /* Iterate through steps of n */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;

    /* Partition (a step of) n into nthreads regions */
    range_N[0] = js;
    num_parts  = 0;
    while (n > 0){
      width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
      if (width < SWITCH_RATIO) {
        width = SWITCH_RATIO;
      }
      n -= width;
      if (n < 0) width = width + n;
      range_N[num_parts + 1] = range_N[num_parts] + width;
      num_parts ++;
    }
    for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
      range_N[j + 1] = range_N[num_parts];
    }

    /* Clear synchronization flags */
    for (i = 0; i < nthreads; i++) {
      for (j = 0; j < nthreads; j++) {
        for (k = 0; k < DIVIDE_RATE; k++) {
          job[i].working[j][CACHE_LINE_SIZE * k] = 0;
        }
      }
    }
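    /* Editor's note: the loop above is the point of the patch described in
     * the commit message; its bounds are nthreads rather than
     * MAX_CPU_NUMBER, so the O(NUM_THREADS^2) flag-clearing cost is paid
     * only for the threads that actually participate. */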
    /* Execute parallel computation */
    exec_blas(nthreads, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}

int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG nthreads_m, nthreads_n;

  /* Get dimensions from index ranges if available */
  if (range_m) {
    m = range_m[1] - range_m[0];
  }
  if (range_n) {
    n = range_n[1] - range_n[0];
  }

  /* Partitions in m should have at least SWITCH_RATIO rows */
  if (m < 2 * SWITCH_RATIO) {
    nthreads_m = 1;
  } else {
    nthreads_m = args -> nthreads;
    while (m < nthreads_m * SWITCH_RATIO) {
      nthreads_m = nthreads_m / 2;
    }
  }

  /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
  if (n < SWITCH_RATIO * nthreads_m) {
    nthreads_n = 1;
  } else {
    nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
    if (nthreads_m * nthreads_n > args -> nthreads) {
      nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
    }
  }
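  /* Editor's worked example (hypothetical numbers): with nthreads = 4,
   * SWITCH_RATIO = 2 and m = n = 100: m >= 2*SWITCH_RATIO, so nthreads_m
   * starts at 4 and stays there (100 >= 4*2). Then nthreads_n =
   * (100 + 8 - 1) / 8 = 13, but 4*13 exceeds 4 threads, so it is clamped
   * to blas_quickdivide(4, 4) = 1, giving a 4 x 1 thread grid. */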
  /* Execute serial or parallel computation */
  if (nthreads_m * nthreads_n <= 1) {
    GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
  } else {
    args -> nthreads = nthreads_m * nthreads_n;
    gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n);
  }

  return 0;
}