You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

level3_gemm3m_thread.c 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* Copyright 2023 The OpenBLAS Project. */
  4. /* All rights reserved. */
  5. /* */
  6. /* Redistribution and use in source and binary forms, with or */
  7. /* without modification, are permitted provided that the following */
  8. /* conditions are met: */
  9. /* */
  10. /* 1. Redistributions of source code must retain the above */
  11. /* copyright notice, this list of conditions and the following */
  12. /* disclaimer. */
  13. /* */
  14. /* 2. Redistributions in binary form must reproduce the above */
  15. /* copyright notice, this list of conditions and the following */
  16. /* disclaimer in the documentation and/or other materials */
  17. /* provided with the distribution. */
  18. /* */
  19. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  20. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  21. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  22. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  23. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  24. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  25. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  26. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  27. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  28. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  29. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  30. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  31. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  32. /* POSSIBILITY OF SUCH DAMAGE. */
  33. /* */
  34. /* The views and conclusions contained in the software and */
  35. /* documentation are those of the authors and should not be */
  36. /* interpreted as representing official policies, either expressed */
  37. /* or implied, of The University of Texas at Austin. */
  38. /*********************************************************************/
  /* Default stride, in BLASLONG elements, between the flag slots of one */
  /* job_t row: slot `s` lives at index CACHE_LINE_SIZE * s. */
  39. #ifndef CACHE_LINE_SIZE
  40. #define CACHE_LINE_SIZE 8
  41. #endif
  /* Default number of sub-ranges (and packed-B buffers) each thread's */
  /* column range is split into. */
  42. #ifndef DIVIDE_RATE
  43. #define DIVIDE_RATE 2
  44. #endif
  45. //The array of job_t may overflow the stack.
  46. //Instead, use malloc to alloc job_t.
  /* USE_ALLOC_HEAP makes gemm_driver allocate its job_t table with malloc */
  /* instead of declaring it as a local array. */
  47. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
  48. #define USE_ALLOC_HEAP
  49. #endif
  /* GEMM3M_LOCAL names the routine CNAME calls directly when the problem */
  /* is below the threading threshold. Unless it is already defined, it is */
  /* chosen from the two-letter variant macro (NN, NT, ... CC) set for this */
  /* compilation; if none of them is defined it is left undefined. */
  50. #ifndef GEMM3M_LOCAL
  51. #if defined(NN)
  52. #define GEMM3M_LOCAL GEMM3M_NN
  53. #elif defined(NT)
  54. #define GEMM3M_LOCAL GEMM3M_NT
  55. #elif defined(NR)
  56. #define GEMM3M_LOCAL GEMM3M_NR
  57. #elif defined(NC)
  58. #define GEMM3M_LOCAL GEMM3M_NC
  59. #elif defined(TN)
  60. #define GEMM3M_LOCAL GEMM3M_TN
  61. #elif defined(TT)
  62. #define GEMM3M_LOCAL GEMM3M_TT
  63. #elif defined(TR)
  64. #define GEMM3M_LOCAL GEMM3M_TR
  65. #elif defined(TC)
  66. #define GEMM3M_LOCAL GEMM3M_TC
  67. #elif defined(RN)
  68. #define GEMM3M_LOCAL GEMM3M_RN
  69. #elif defined(RT)
  70. #define GEMM3M_LOCAL GEMM3M_RT
  71. #elif defined(RR)
  72. #define GEMM3M_LOCAL GEMM3M_RR
  73. #elif defined(RC)
  74. #define GEMM3M_LOCAL GEMM3M_RC
  75. #elif defined(CN)
  76. #define GEMM3M_LOCAL GEMM3M_CN
  77. #elif defined(CT)
  78. #define GEMM3M_LOCAL GEMM3M_CT
  79. #elif defined(CR)
  80. #define GEMM3M_LOCAL GEMM3M_CR
  81. #elif defined(CC)
  82. #define GEMM3M_LOCAL GEMM3M_CC
  83. #endif
  84. #endif
  /* Per-thread table of hand-off flags shared between the workers. */
  /* Entry working[i][CACHE_LINE_SIZE * s] of thread t's job_t is either 0 */
  /* or the address of thread t's packed-B buffer `s`, stored as a BLASLONG, */
  /* which thread i may then read. The element type is _Atomic when */
  /* HAVE_C11 is defined and volatile otherwise. */
  85. typedef struct {
  86. #ifdef HAVE_C11
  87. _Atomic
  88. #else
  89. volatile
  90. #endif
  91. BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
  92. } job_t;
  /*
   * Operation wrappers used by the threaded GEMM3M driver.  Each one is
   * only defined when the including translation unit has not supplied
   * its own version.
   *
   *   BETA_OPERATION    scales the C sub-block [M_FROM, M_TO) x [N_FROM, N_TO)
   *                     through GEMM_BETA.
   *   ICOPY{B,R,I}      pack a block of A into BUFFER; the T or N copy
   *                     routine and the (X, Y) index order are picked from
   *                     the variant macro.
   *   OCOPY{B,R,I}      pack a block of B into BUFFER, forwarding
   *                     (ALPHA_R, ALPHA_I) to the copy routine.
   *   KERNEL_OPERATION  runs KERNEL_FUNC on the C block at (X, Y).
   *
   * Every macro expands to a single expression without a trailing
   * semicolon, so `OP(...);` is exactly one statement and stays valid
   * inside an unbraced if/else.  All macro arguments are parenthesized
   * where they take part in arithmetic or subscripting.
   */
  #ifndef BETA_OPERATION
  #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
    GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
              (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
              (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
  #endif

  #ifndef ICOPYB_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  #ifndef ICOPYR_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  #ifndef ICOPYI_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  #ifndef OCOPYB_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  #ifndef OCOPYR_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  #ifndef OCOPYI_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  #ifndef KERNEL_FUNC
  #define KERNEL_FUNC GEMM3M_KERNEL
  #endif

  #ifndef KERNEL_OPERATION
  #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
    KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, LDC)
  #endif
  /* Shorthand accessors for the fields of the blas_arg_t in scope. Each */
  /* expands to `args -> <field>`, so they can only be used where a */
  /* variable named `args` is visible. Each can be pre-defined by the */
  /* including file. */
  166. #ifndef A
  167. #define A args -> a
  168. #endif
  169. #ifndef LDA
  170. #define LDA args -> lda
  171. #endif
  172. #ifndef B
  173. #define B args -> b
  174. #endif
  175. #ifndef LDB
  176. #define LDB args -> ldb
  177. #endif
  178. #ifndef C
  179. #define C args -> c
  180. #endif
  181. #ifndef LDC
  182. #define LDC args -> ldc
  183. #endif
  184. #ifndef M
  185. #define M args -> m
  186. #endif
  187. #ifndef N
  188. #define N args -> n
  189. #endif
  190. #ifndef K
  191. #define K args -> k
  192. #endif
  /* Per-variant (real, imaginary) scale pairs handed to KERNEL_OPERATION. */
  /* inner_thread uses three of the pairs, one per pass over a K slice: */
  /*   ALPHA5 / ALPHA6   with the ICOPYB-packed A block, */
  /*   ALPHA11 / ALPHA12 with the ICOPYR-packed A block, */
  /*   ALPHA17 / ALPHA18 with the ICOPYI-packed A block. */
  /* ALPHA1, ALPHA2, ALPHA7, ALPHA8, ALPHA13 and ALPHA14 are defined here */
  /* but not referenced by the code visible in this file. */
  /* The four groups below are selected by the two-letter variant macro. */
  193. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  194. #define ALPHA1 ONE
  195. #define ALPHA2 ONE
  196. #define ALPHA5 ZERO
  197. #define ALPHA6 ONE
  198. #define ALPHA7 ONE
  199. #define ALPHA8 ZERO
  200. #define ALPHA11 ONE
  201. #define ALPHA12 -ONE
  202. #define ALPHA13 ZERO
  203. #define ALPHA14 ONE
  204. #define ALPHA17 -ONE
  205. #define ALPHA18 -ONE
  206. #endif
  207. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  208. #define ALPHA1 ONE
  209. #define ALPHA2 ONE
  210. #define ALPHA5 ONE
  211. #define ALPHA6 ZERO
  212. #define ALPHA7 ZERO
  213. #define ALPHA8 ONE
  214. #define ALPHA11 -ONE
  215. #define ALPHA12 -ONE
  216. #define ALPHA13 ONE
  217. #define ALPHA14 ZERO
  218. #define ALPHA17 -ONE
  219. #define ALPHA18 ONE
  220. #endif
  221. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  222. #define ALPHA1 ONE
  223. #define ALPHA2 ONE
  224. #define ALPHA5 ONE
  225. #define ALPHA6 ZERO
  226. #define ALPHA7 ZERO
  227. #define ALPHA8 ONE
  228. #define ALPHA11 -ONE
  229. #define ALPHA12 ONE
  230. #define ALPHA13 ONE
  231. #define ALPHA14 ZERO
  232. #define ALPHA17 -ONE
  233. #define ALPHA18 -ONE
  234. #endif
  235. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  236. #define ALPHA1 ONE
  237. #define ALPHA2 ONE
  238. #define ALPHA5 ZERO
  239. #define ALPHA6 -ONE
  240. #define ALPHA7 ONE
  241. #define ALPHA8 ZERO
  242. #define ALPHA11 ONE
  243. #define ALPHA12 ONE
  244. #define ALPHA13 ZERO
  245. #define ALPHA14 ONE
  246. #define ALPHA17 -ONE
  247. #define ALPHA18 ONE
  248. #endif
  /* Timing helpers. With TIMING defined, START_RPCC() stores rpcc() in the */
  /* local `rpcc_counter` and STOP_RPCC(COUNTER) adds the rpcc() delta since */
  /* then to COUNTER. Without TIMING both expand to nothing. */
  249. #ifdef TIMING
  250. #define START_RPCC() rpcc_counter = rpcc()
  251. #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
  252. #else
  253. #define START_RPCC()
  254. #define STOP_RPCC(COUNTER)
  255. #endif
  /* inner_thread - per-thread worker installed by gemm_driver as the */
  /* queue routine. */
  /* */
  /* args     problem description; args->common points at the shared job_t */
  /*          table and args->nthreads is the number of cooperating workers. */
  /* range_m  if non-NULL, [range_m[0], range_m[1]) is this worker's row range. */
  /* range_n  column partition; this worker packs columns */
  /*          [range_n[mypos], range_n[mypos + 1]) of B. */
  /* sa, sb   scratch for the packed A block and the packed B buffers. */
  /* mypos    index of this worker in the job_t table and in range_n. */
  /* */
  /* For every K slice the function makes three passes (A packed with */
  /* ICOPYB, ICOPYR, ICOPYI in turn). In each pass it packs its own columns */
  /* of B, runs the kernel on them, publishes the buffer address in */
  /* job[mypos].working[*], then runs the kernel against the buffers the */
  /* other workers published, and clears a flag once it is done with that */
  /* buffer. Always returns 0. */
  /* */
  /* NOTE(review): range_n is tested for NULL once near the top but is */
  /* dereferenced unconditionally in the loops further down. */
  256. static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  257. BLASLONG k, lda, ldb, ldc;
  258. BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
  259. FLOAT *alpha, *beta;
  260. FLOAT *a, *b, *c;
  261. job_t *job = (job_t *)args -> common;
  262. BLASLONG xxx, bufferside;
  263. FLOAT *buffer[DIVIDE_RATE];
  264. BLASLONG ls, min_l, jjs, min_jj;
  265. BLASLONG is, min_i, div_n;
  266. BLASLONG i, current;
  267. #ifdef TIMING
  268. BLASLONG rpcc_counter;
  269. BLASLONG copy_A = 0;
  270. BLASLONG copy_B = 0;
  271. BLASLONG kernel = 0;
  272. BLASLONG waiting1 = 0;
  273. BLASLONG waiting2 = 0;
  274. BLASLONG waiting3 = 0;
  /* NOTE(review): waiting6 is zeroed here but never read afterwards. */
  275. BLASLONG waiting6[MAX_CPU_NUMBER];
  276. BLASLONG ops = 0;
  277. for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
  278. #endif
  /* Copy the problem description out of args (K, A, ... are macros that */
  /* expand to args -> field). */
  279. k = K;
  280. a = (FLOAT *)A;
  281. b = (FLOAT *)B;
  282. c = (FLOAT *)C;
  283. lda = LDA;
  284. ldb = LDB;
  285. ldc = LDC;
  286. alpha = (FLOAT *)args -> alpha;
  287. beta = (FLOAT *)args -> beta;
  /* Row range: all of M unless range_m narrows it. */
  288. m_from = 0;
  289. m_to = M;
  290. if (range_m) {
  291. m_from = range_m[0];
  292. m_to = range_m[1];
  293. }
  /* Column ranges: [n_from, n_to) is this worker's own slice, */
  /* [N_from, N_to) spans the slices of all args->nthreads workers. */
  294. n_from = 0;
  295. n_to = N;
  296. N_from = 0;
  297. N_to = N;
  298. if (range_n) {
  299. n_from = range_n[mypos + 0];
  300. n_to = range_n[mypos + 1];
  301. N_from = range_n[0];
  302. N_to = range_n[args -> nthreads];
  303. }
  /* Scale this worker's rows of C over the full column span unless beta */
  /* is exactly (ONE, ZERO). */
  304. if (beta) {
  305. if ((beta[0] != ONE) || (beta[1] != ZERO))
  306. BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  307. }
  /* Nothing to accumulate when K is 0 or alpha is missing or zero. */
  308. if ((k == 0) || (alpha == NULL)) return 0;
  309. if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
  310. #if 0
  311. fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
  312. mypos, m_from, m_to, n_from, n_to, N_from, N_to);
  313. #endif
  /* Carve sb into DIVIDE_RATE buffers, each sized for GEMM3M_Q rows times */
  /* div_n columns rounded up to a multiple of GEMM3M_UNROLL_N. */
  314. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  315. buffer[0] = sb;
  316. for (i = 1; i < DIVIDE_RATE; i++) {
  317. buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
  318. }
  /* Walk K in slices of min_l: GEMM3M_Q while at least 2 * GEMM3M_Q remain, */
  /* otherwise half of the remainder (rounded up) if it exceeds GEMM3M_Q, */
  /* otherwise the whole remainder. */
  319. for(ls = 0; ls < k; ls += min_l){
  320. min_l = k - ls;
  321. if (min_l >= GEMM3M_Q * 2) {
  322. min_l = GEMM3M_Q;
  323. } else {
  324. if (min_l > GEMM3M_Q) {
  325. min_l = (min_l + 1) / 2;
  326. }
  327. }
  /* Height of the first row block, chosen the same way from GEMM3M_P and */
  /* rounded up to a multiple of GEMM3M_UNROLL_M when halved. */
  328. min_i = m_to - m_from;
  329. if (min_i >= GEMM3M_P * 2) {
  330. min_i = GEMM3M_P;
  331. } else {
  332. if (min_i > GEMM3M_P) {
  333. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  334. }
  335. }
  /* ---- Pass 1: A packed with ICOPYB, B with OCOPYB, kernel scaled by */
  /* (ALPHA5, ALPHA6). ---- */
  336. START_RPCC();
  337. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  338. STOP_RPCC(copy_A);
  339. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  340. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  341. START_RPCC();
  342. /* Make sure if no one is using another buffer */
  /* Spin until every worker's flag for this buffer side is 0 before the */
  /* buffer is overwritten. */
  343. for (i = 0; i < args -> nthreads; i++)
  344. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  345. STOP_RPCC(waiting1);
  /* Pack and multiply this sub-range in strips of at most */
  /* 3 * GEMM3M_UNROLL_N columns. */
  346. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  347. min_jj = MIN(n_to, xxx + div_n) - jjs;
  348. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  349. START_RPCC();
  /* The imaginary part of alpha is negated for the variants not listed */
  /* in this condition. */
  350. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
  351. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  352. #else
  353. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  354. #endif
  355. STOP_RPCC(copy_B);
  356. START_RPCC();
  357. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
  358. sa, buffer[bufferside] + min_l * (jjs - xxx),
  359. c, ldc, m_from, jjs);
  360. STOP_RPCC(kernel);
  361. #ifdef TIMING
  362. ops += 2 * min_i * min_jj * min_l;
  363. #endif
  364. }
  /* Publish the packed buffer: store its address in every worker's flag */
  /* slot for this side, then issue WMB. */
  365. for (i = 0; i < args -> nthreads; i++)
  366. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  367. WMB;
  368. }
  /* Visit the other workers in ring order starting after mypos; the loop */
  /* ends when `current` wraps back to mypos. */
  369. current = mypos;
  370. do {
  371. current ++;
  372. if (current >= args -> nthreads) current = 0;
  373. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  374. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  375. if (current != mypos) {
  376. START_RPCC();
  377. /* thread has to wait */
  378. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  379. STOP_RPCC(waiting2);
  380. START_RPCC();
  /* The flag value is the address of the other worker's packed B buffer. */
  381. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  382. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  383. c, ldc, m_from, xxx);
  384. STOP_RPCC(kernel);
  385. #ifdef TIMING
  386. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  387. #endif
  388. }
  /* If the first row block already covers all rows, this worker is done */
  /* with the buffer and clears its flag. */
  389. if (m_to - m_from == min_i) {
  390. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  391. WMB;
  392. }
  393. }
  394. } while (current != mypos);
  /* Remaining row blocks: repack A and reuse every published B buffer, */
  /* starting with this worker's own. */
  395. for(is = m_from + min_i; is < m_to; is += min_i){
  396. min_i = m_to - is;
  397. if (min_i >= GEMM3M_P * 2) {
  398. min_i = GEMM3M_P;
  399. } else
  400. if (min_i > GEMM3M_P) {
  401. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  402. }
  403. START_RPCC();
  404. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  405. STOP_RPCC(copy_A);
  406. current = mypos;
  407. do {
  408. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  409. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  410. START_RPCC();
  411. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  412. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  413. c, ldc, is, xxx);
  414. STOP_RPCC(kernel);
  415. #ifdef TIMING
  /* NOTE(review): this count uses (range width - div_n) rather than the */
  /* MIN(...) column count passed to the kernel call above. */
  416. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  417. #endif
  418. if (is + min_i >= m_to) {
  419. /* Thread doesn't need this buffer any more */
  420. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  421. WMB;
  422. }
  423. }
  424. current ++;
  425. if (current >= args -> nthreads) current = 0;
  426. } while (current != mypos);
  427. } /* end of is */
  /* ---- Pass 2: A packed with ICOPYR, kernel scaled by (ALPHA11, ALPHA12). */
  /* min_i still holds the height of the last row block processed above. ---- */
  428. START_RPCC();
  429. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  430. STOP_RPCC(copy_A);
  431. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  432. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  433. START_RPCC();
  434. /* Make sure if no one is using another buffer */
  435. for (i = 0; i < args -> nthreads; i++)
  436. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  437. STOP_RPCC(waiting1);
  438. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  439. min_jj = MIN(n_to, xxx + div_n) - jjs;
  440. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  441. START_RPCC();
  /* The variant selects OCOPYR or OCOPYI and the sign of alpha's */
  /* imaginary part. */
  442. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  443. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  444. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  445. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  446. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  447. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  448. #else
  449. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  450. #endif
  451. STOP_RPCC(copy_B);
  452. START_RPCC();
  453. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
  454. sa, buffer[bufferside] + min_l * (jjs - xxx),
  455. c, ldc, m_from, jjs);
  456. STOP_RPCC(kernel);
  457. #ifdef TIMING
  458. ops += 2 * min_i * min_jj * min_l;
  459. #endif
  460. }
  /* NOTE(review): unlike pass 1, no WMB follows this publish loop. */
  461. for (i = 0; i < args -> nthreads; i++)
  462. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  463. }
  464. current = mypos;
  465. do {
  466. current ++;
  467. if (current >= args -> nthreads) current = 0;
  468. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  469. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  470. if (current != mypos) {
  471. START_RPCC();
  472. /* thread has to wait */
  473. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  474. STOP_RPCC(waiting2);
  475. START_RPCC();
  476. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  477. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  478. c, ldc, m_from, xxx);
  479. STOP_RPCC(kernel);
  480. #ifdef TIMING
  481. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  482. #endif
  483. }
  484. if (m_to - m_from == min_i) {
  485. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  486. WMB;
  487. }
  488. }
  489. } while (current != mypos);
  490. for(is = m_from + min_i; is < m_to; is += min_i){
  491. min_i = m_to - is;
  492. if (min_i >= GEMM3M_P * 2) {
  493. min_i = GEMM3M_P;
  494. } else
  495. if (min_i > GEMM3M_P) {
  496. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  497. }
  498. START_RPCC();
  499. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  500. STOP_RPCC(copy_A);
  501. current = mypos;
  502. do {
  503. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  504. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  505. START_RPCC();
  506. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  507. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  508. c, ldc, is, xxx);
  509. STOP_RPCC(kernel);
  510. #ifdef TIMING
  511. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  512. #endif
  513. if (is + min_i >= m_to) {
  514. /* Thread doesn't need this buffer any more */
  /* NOTE(review): no WMB follows this release, unlike the matching */
  /* release in passes 1 and 3. */
  515. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  516. }
  517. }
  518. current ++;
  519. if (current >= args -> nthreads) current = 0;
  520. } while (current != mypos);
  521. } /* end of is */
  /* ---- Pass 3: A packed with ICOPYI, kernel scaled by (ALPHA17, ALPHA18). ---- */
  522. START_RPCC();
  523. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  524. STOP_RPCC(copy_A);
  525. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  526. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  527. START_RPCC();
  528. /* Make sure if no one is using another buffer */
  529. for (i = 0; i < args -> nthreads; i++)
  530. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  531. STOP_RPCC(waiting1);
  532. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  533. min_jj = MIN(n_to, xxx + div_n) - jjs;
  534. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  535. START_RPCC();
  /* Same selection as pass 2 with the OCOPYR and OCOPYI roles swapped. */
  536. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  537. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  538. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  539. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  540. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  541. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  542. #else
  543. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  544. #endif
  545. STOP_RPCC(copy_B);
  546. START_RPCC();
  547. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
  548. sa, buffer[bufferside] + min_l * (jjs - xxx),
  549. c, ldc, m_from, jjs);
  550. STOP_RPCC(kernel);
  551. #ifdef TIMING
  552. ops += 2 * min_i * min_jj * min_l;
  553. #endif
  554. }
  /* NOTE(review): unlike pass 1, no WMB follows this publish loop. */
  555. for (i = 0; i < args -> nthreads; i++)
  556. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  557. }
  558. current = mypos;
  559. do {
  560. current ++;
  561. if (current >= args -> nthreads) current = 0;
  562. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  563. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  564. if (current != mypos) {
  565. START_RPCC();
  566. /* thread has to wait */
  567. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  568. STOP_RPCC(waiting2);
  569. START_RPCC();
  570. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  571. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  572. c, ldc, m_from, xxx);
  573. STOP_RPCC(kernel);
  574. #ifdef TIMING
  575. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  576. #endif
  577. }
  578. if (m_to - m_from == min_i) {
  /* NOTE(review): `&= 0` leaves the flag at 0 like the plain `= 0` used */
  /* in passes 1 and 2, but as a read-modify-write. */
  579. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
  580. WMB;
  581. }
  582. }
  583. } while (current != mypos);
  584. for(is = m_from + min_i; is < m_to; is += min_i){
  585. min_i = m_to - is;
  586. if (min_i >= GEMM3M_P * 2) {
  587. min_i = GEMM3M_P;
  588. } else
  589. if (min_i > GEMM3M_P) {
  590. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  591. }
  592. START_RPCC();
  593. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  594. STOP_RPCC(copy_A);
  595. current = mypos;
  596. do {
  597. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  598. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  599. START_RPCC();
  600. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  601. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  602. c, ldc, is, xxx);
  603. STOP_RPCC(kernel);
  604. #ifdef TIMING
  605. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  606. #endif
  607. if (is + min_i >= m_to) {
  608. /* Thread doesn't need this buffer any more */
  609. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
  610. WMB;
  611. }
  612. }
  613. current ++;
  614. if (current >= args -> nthreads) current = 0;
  615. } while (current != mypos);
  616. } /* end of is */
  617. }
  /* Before returning, spin until every worker has cleared its flag on all */
  /* of this worker's buffers. */
  618. START_RPCC();
  619. for (i = 0; i < args -> nthreads; i++) {
  620. for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
  621. while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
  622. }
  623. }
  624. STOP_RPCC(waiting3);
  /* Optional timing report written to stderr. */
  625. #ifdef TIMING
  626. BLASLONG waiting = waiting1 + waiting2 + waiting3;
  627. BLASLONG total = copy_A + copy_B + kernel + waiting;
  628. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n",
  629. mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
  630. (double)waiting /(double)total * 100.,
  631. (double)ops/(double)kernel / 2. * 100.);
  632. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
  633. mypos, copy_A, copy_B, waiting);
  634. #if 0
  635. fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
  636. mypos,
  637. (double)waiting1/(double)waiting * 100.,
  638. (double)waiting2/(double)waiting * 100.,
  639. (double)waiting3/(double)waiting * 100.);
  640. #endif
  641. fprintf(stderr, "\n");
  642. #endif
  643. return 0;
  644. }
  645. static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  646. *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  647. #ifndef USE_OPENMP
  648. #ifndef OS_WINDOWS
  649. static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
  650. #else
  651. CRITICAL_SECTION level3_lock;
  652. InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
  653. #endif
  654. #endif
  655. blas_arg_t newarg;
  656. blas_queue_t queue[MAX_CPU_NUMBER];
  657. BLASLONG range_M[MAX_CPU_NUMBER + 1];
  658. BLASLONG range_N[MAX_CPU_NUMBER + 1];
  659. #ifndef USE_ALLOC_HEAP
  660. job_t job[MAX_CPU_NUMBER];
  661. #else
  662. job_t * job = NULL;
  663. #endif
  664. BLASLONG num_cpu_m, num_cpu_n;
  665. BLASLONG nthreads = args -> nthreads;
  666. BLASLONG width, i, j, k, js;
  667. BLASLONG m, n, n_from, n_to;
  668. int mode;
  669. #ifdef XDOUBLE
  670. mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
  671. #elif defined(DOUBLE)
  672. mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
  673. #else
  674. mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
  675. #endif
  676. #ifndef USE_OPENMP
  677. #ifndef OS_WINDOWS
  678. pthread_mutex_lock(&level3_lock);
  679. #else
  680. EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
  681. #endif
  682. #endif
  683. newarg.m = args -> m;
  684. newarg.n = args -> n;
  685. newarg.k = args -> k;
  686. newarg.a = args -> a;
  687. newarg.b = args -> b;
  688. newarg.c = args -> c;
  689. newarg.lda = args -> lda;
  690. newarg.ldb = args -> ldb;
  691. newarg.ldc = args -> ldc;
  692. newarg.alpha = args -> alpha;
  693. newarg.beta = args -> beta;
  694. newarg.nthreads = args -> nthreads;
  695. #ifdef USE_ALLOC_HEAP
  696. job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  697. if(job==NULL){
  698. fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
  699. exit(1);
  700. }
  701. #endif
  702. newarg.common = (void *)job;
  703. if (!range_m) {
  704. range_M[0] = 0;
  705. m = args -> m;
  706. } else {
  707. range_M[0] = range_m[0];
  708. m = range_m[1] - range_m[0];
  709. }
  710. num_cpu_m = 0;
  711. while (m > 0){
  712. width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
  713. m -= width;
  714. if (m < 0) width = width + m;
  715. range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
  716. num_cpu_m ++;
  717. }
  718. for (i = 0; i < num_cpu_m; i++) {
  719. queue[i].mode = mode;
  720. queue[i].routine = inner_thread;
  721. queue[i].args = &newarg;
  722. queue[i].range_m = &range_M[i];
  723. queue[i].range_n = &range_N[0];
  724. queue[i].sa = NULL;
  725. queue[i].sb = NULL;
  726. queue[i].next = &queue[i + 1];
  727. }
  728. queue[0].sa = sa;
  729. queue[0].sb = sb;
  730. if (!range_n) {
  731. n_from = 0;
  732. n_to = args -> n;
  733. } else {
  734. n_from = range_n[0];
  735. n_to = range_n[1];
  736. }
  737. for(js = n_from; js < n_to; js += GEMM_R * nthreads){
  738. n = n_to - js;
  739. if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
  740. range_N[0] = js;
  741. num_cpu_n = 0;
  742. while (n > 0){
  743. width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
  744. n -= width;
  745. if (n < 0) width = width + n;
  746. range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
  747. num_cpu_n ++;
  748. }
  749. for (j = 0; j < num_cpu_m; j++) {
  750. for (i = 0; i < num_cpu_m; i++) {
  751. for (k = 0; k < DIVIDE_RATE; k++) {
  752. job[j].working[i][CACHE_LINE_SIZE * k] = 0;
  753. }
  754. }
  755. }
  756. queue[num_cpu_m - 1].next = NULL;
  757. exec_blas(num_cpu_m, queue);
  758. }
  759. #ifdef USE_ALLOC_HEAP
  760. free(job);
  761. #endif
  762. #ifndef USE_OPENMP
  763. #ifndef OS_WINDOWS
  764. pthread_mutex_unlock(&level3_lock);
  765. #else
  766. LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
  767. #endif
  768. #endif
  769. return 0;
  770. }
  771. int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  772. BLASLONG m = args -> m;
  773. // BLASLONG n = args -> n;
  774. BLASLONG nthreads = args -> nthreads;
  775. BLASLONG divN, divT;
  776. int mode;
  777. #if defined(DYNAMIC_ARCH)
  778. int switch_ratio = gotoblas->switch_ratio;
  779. #else
  780. int switch_ratio = SWITCH_RATIO;
  781. #endif
  782. if (range_m) {
  783. BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
  784. BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
  785. m = m_to - m_from;
  786. }
  787. /*
  788. if (range_n) {
  789. BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
  790. BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
  791. n = n_to - n_from;
  792. }
  793. */
  794. if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
  795. GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
  796. return 0;
  797. }
  798. divT = nthreads;
  799. divN = 1;
  800. while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
  801. do {
  802. divT --;
  803. divN = 1;
  804. while (divT * divN < nthreads) divN ++;
  805. } while ((divT * divN != nthreads) && (divT > 1));
  806. }
  807. args -> nthreads = divT;
  808. if (divN == 1){
  809. gemm_driver(args, range_m, range_n, sa, sb, 0);
  810. } else {
  811. #ifdef XDOUBLE
  812. mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  813. #elif defined(DOUBLE)
  814. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  815. #else
  816. mode = BLAS_SINGLE | BLAS_COMPLEX;
  817. #endif
  818. #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
  819. defined(CN) || defined(CT) || defined(CR) || defined(CC)
  820. mode |= (BLAS_TRANSA_T);
  821. #endif
  822. #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
  823. defined(NC) || defined(TC) || defined(RC) || defined(CC)
  824. mode |= (BLAS_TRANSB_T);
  825. #endif
  826. gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
  827. }
  828. return 0;
  829. }