You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

level3_gemm3m_thread.c 29 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #ifndef CACHE_LINE_SIZE
  39. #define CACHE_LINE_SIZE 8
  40. #endif
  41. #ifndef DIVIDE_RATE
  42. #define DIVIDE_RATE 2
  43. #endif
  44. #ifndef SWITCH_RATIO
  45. #define SWITCH_RATIO 2
  46. #endif
  47. //The array of job_t may overflow the stack.
  48. //Instead, use malloc to alloc job_t.
  49. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
  50. #define USE_ALLOC_HEAP
  51. #endif
  52. #ifndef GEMM3M_LOCAL
  53. #if defined(NN)
  54. #define GEMM3M_LOCAL GEMM3M_NN
  55. #elif defined(NT)
  56. #define GEMM3M_LOCAL GEMM3M_NT
  57. #elif defined(NR)
  58. #define GEMM3M_LOCAL GEMM3M_NR
  59. #elif defined(NC)
  60. #define GEMM3M_LOCAL GEMM3M_NC
  61. #elif defined(TN)
  62. #define GEMM3M_LOCAL GEMM3M_TN
  63. #elif defined(TT)
  64. #define GEMM3M_LOCAL GEMM3M_TT
  65. #elif defined(TR)
  66. #define GEMM3M_LOCAL GEMM3M_TR
  67. #elif defined(TC)
  68. #define GEMM3M_LOCAL GEMM3M_TC
  69. #elif defined(RN)
  70. #define GEMM3M_LOCAL GEMM3M_RN
  71. #elif defined(RT)
  72. #define GEMM3M_LOCAL GEMM3M_RT
  73. #elif defined(RR)
  74. #define GEMM3M_LOCAL GEMM3M_RR
  75. #elif defined(RC)
  76. #define GEMM3M_LOCAL GEMM3M_RC
  77. #elif defined(CN)
  78. #define GEMM3M_LOCAL GEMM3M_CN
  79. #elif defined(CT)
  80. #define GEMM3M_LOCAL GEMM3M_CT
  81. #elif defined(CR)
  82. #define GEMM3M_LOCAL GEMM3M_CR
  83. #elif defined(CC)
  84. #define GEMM3M_LOCAL GEMM3M_CC
  85. #endif
  86. #endif
  87. typedef struct {
  88. #if __STDC_VERSION__ >= 201112L
  89. _Atomic
  90. #else
  91. volatile
  92. #endif
  93. BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
  94. } job_t;
  95. #ifndef BETA_OPERATION
  96. #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  97. GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
  98. BETA[0], BETA[1], NULL, 0, NULL, 0, \
  99. (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
  100. #endif
  101. #ifndef ICOPYB_OPERATION
  102. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  103. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  104. #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  105. GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  106. #else
  107. #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  108. GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  109. #endif
  110. #endif
  111. #ifndef ICOPYR_OPERATION
  112. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  113. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  114. #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  115. GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  116. #else
  117. #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  118. GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  119. #endif
  120. #endif
  121. #ifndef ICOPYI_OPERATION
  122. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  123. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  124. #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  125. GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  126. #else
  127. #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  128. GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  129. #endif
  130. #endif
  131. #ifndef OCOPYB_OPERATION
  132. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  133. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  134. #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  135. GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  136. #else
  137. #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  138. GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  139. #endif
  140. #endif
  141. #ifndef OCOPYR_OPERATION
  142. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  143. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  144. #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  145. GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  146. #else
  147. #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  148. GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  149. #endif
  150. #endif
  151. #ifndef OCOPYI_OPERATION
  152. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  153. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  154. #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  155. GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  156. #else
  157. #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  158. GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  159. #endif
  160. #endif
  161. #ifndef KERNEL_FUNC
  162. #define KERNEL_FUNC GEMM3M_KERNEL
  163. #endif
  164. #ifndef KERNEL_OPERATION
  165. #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
  166. KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
  167. #endif
  168. #ifndef A
  169. #define A args -> a
  170. #endif
  171. #ifndef LDA
  172. #define LDA args -> lda
  173. #endif
  174. #ifndef B
  175. #define B args -> b
  176. #endif
  177. #ifndef LDB
  178. #define LDB args -> ldb
  179. #endif
  180. #ifndef C
  181. #define C args -> c
  182. #endif
  183. #ifndef LDC
  184. #define LDC args -> ldc
  185. #endif
  186. #ifndef M
  187. #define M args -> m
  188. #endif
  189. #ifndef N
  190. #define N args -> n
  191. #endif
  192. #ifndef K
  193. #define K args -> k
  194. #endif
  195. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  196. #define ALPHA1 ONE
  197. #define ALPHA2 ONE
  198. #define ALPHA5 ZERO
  199. #define ALPHA6 ONE
  200. #define ALPHA7 ONE
  201. #define ALPHA8 ZERO
  202. #define ALPHA11 ONE
  203. #define ALPHA12 -ONE
  204. #define ALPHA13 ZERO
  205. #define ALPHA14 ONE
  206. #define ALPHA17 -ONE
  207. #define ALPHA18 -ONE
  208. #endif
  209. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  210. #define ALPHA1 ONE
  211. #define ALPHA2 ONE
  212. #define ALPHA5 ONE
  213. #define ALPHA6 ZERO
  214. #define ALPHA7 ZERO
  215. #define ALPHA8 ONE
  216. #define ALPHA11 -ONE
  217. #define ALPHA12 -ONE
  218. #define ALPHA13 ONE
  219. #define ALPHA14 ZERO
  220. #define ALPHA17 -ONE
  221. #define ALPHA18 ONE
  222. #endif
  223. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  224. #define ALPHA1 ONE
  225. #define ALPHA2 ONE
  226. #define ALPHA5 ONE
  227. #define ALPHA6 ZERO
  228. #define ALPHA7 ZERO
  229. #define ALPHA8 ONE
  230. #define ALPHA11 -ONE
  231. #define ALPHA12 ONE
  232. #define ALPHA13 ONE
  233. #define ALPHA14 ZERO
  234. #define ALPHA17 -ONE
  235. #define ALPHA18 -ONE
  236. #endif
  237. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  238. #define ALPHA1 ONE
  239. #define ALPHA2 ONE
  240. #define ALPHA5 ZERO
  241. #define ALPHA6 -ONE
  242. #define ALPHA7 ONE
  243. #define ALPHA8 ZERO
  244. #define ALPHA11 ONE
  245. #define ALPHA12 ONE
  246. #define ALPHA13 ZERO
  247. #define ALPHA14 ONE
  248. #define ALPHA17 -ONE
  249. #define ALPHA18 ONE
  250. #endif
  251. #ifdef TIMING
  252. #define START_RPCC() rpcc_counter = rpcc()
  253. #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
  254. #else
  255. #define START_RPCC()
  256. #define STOP_RPCC(COUNTER)
  257. #endif
  258. static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  259. BLASLONG k, lda, ldb, ldc;
  260. BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
  261. FLOAT *alpha, *beta;
  262. FLOAT *a, *b, *c;
  263. job_t *job = (job_t *)args -> common;
  264. BLASLONG xxx, bufferside;
  265. FLOAT *buffer[DIVIDE_RATE];
  266. BLASLONG ls, min_l, jjs, min_jj;
  267. BLASLONG is, min_i, div_n;
  268. BLASLONG i, current;
  269. #ifdef TIMING
  270. BLASLONG rpcc_counter;
  271. BLASLONG copy_A = 0;
  272. BLASLONG copy_B = 0;
  273. BLASLONG kernel = 0;
  274. BLASLONG waiting1 = 0;
  275. BLASLONG waiting2 = 0;
  276. BLASLONG waiting3 = 0;
  277. BLASLONG waiting6[MAX_CPU_NUMBER];
  278. BLASLONG ops = 0;
  279. for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
  280. #endif
  281. k = K;
  282. a = (FLOAT *)A;
  283. b = (FLOAT *)B;
  284. c = (FLOAT *)C;
  285. lda = LDA;
  286. ldb = LDB;
  287. ldc = LDC;
  288. alpha = (FLOAT *)args -> alpha;
  289. beta = (FLOAT *)args -> beta;
  290. m_from = 0;
  291. m_to = M;
  292. if (range_m) {
  293. m_from = range_m[0];
  294. m_to = range_m[1];
  295. }
  296. n_from = 0;
  297. n_to = N;
  298. N_from = 0;
  299. N_to = N;
  300. if (range_n) {
  301. n_from = range_n[mypos + 0];
  302. n_to = range_n[mypos + 1];
  303. N_from = range_n[0];
  304. N_to = range_n[args -> nthreads];
  305. }
  306. if (beta) {
  307. if ((beta[0] != ONE) || (beta[1] != ZERO))
  308. BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  309. }
  310. if ((k == 0) || (alpha == NULL)) return 0;
  311. if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
  312. #if 0
  313. fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
  314. mypos, m_from, m_to, n_from, n_to, N_from, N_to);
  315. #endif
  316. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  317. buffer[0] = sb;
  318. for (i = 1; i < DIVIDE_RATE; i++) {
  319. buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
  320. }
  321. for(ls = 0; ls < k; ls += min_l){
  322. min_l = k - ls;
  323. if (min_l >= GEMM3M_Q * 2) {
  324. min_l = GEMM3M_Q;
  325. } else {
  326. if (min_l > GEMM3M_Q) {
  327. min_l = (min_l + 1) / 2;
  328. }
  329. }
  330. min_i = m_to - m_from;
  331. if (min_i >= GEMM3M_P * 2) {
  332. min_i = GEMM3M_P;
  333. } else {
  334. if (min_i > GEMM3M_P) {
  335. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  336. }
  337. }
  338. START_RPCC();
  339. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  340. STOP_RPCC(copy_A);
  341. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  342. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  343. START_RPCC();
  344. /* Make sure if no one is using another buffer */
  345. for (i = 0; i < args -> nthreads; i++)
  346. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  347. STOP_RPCC(waiting1);
  348. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  349. min_jj = MIN(n_to, xxx + div_n) - jjs;
  350. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  351. START_RPCC();
  352. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
  353. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  354. #else
  355. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  356. #endif
  357. STOP_RPCC(copy_B);
  358. START_RPCC();
  359. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
  360. sa, buffer[bufferside] + min_l * (jjs - xxx),
  361. c, ldc, m_from, jjs);
  362. STOP_RPCC(kernel);
  363. #ifdef TIMING
  364. ops += 2 * min_i * min_jj * min_l;
  365. #endif
  366. }
  367. for (i = 0; i < args -> nthreads; i++)
  368. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  369. WMB;
  370. }
  371. current = mypos;
  372. do {
  373. current ++;
  374. if (current >= args -> nthreads) current = 0;
  375. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  376. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  377. if (current != mypos) {
  378. START_RPCC();
  379. /* thread has to wait */
  380. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  381. STOP_RPCC(waiting2);
  382. START_RPCC();
  383. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  384. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  385. c, ldc, m_from, xxx);
  386. STOP_RPCC(kernel);
  387. #ifdef TIMING
  388. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  389. #endif
  390. }
  391. if (m_to - m_from == min_i) {
  392. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  393. WMB;
  394. }
  395. }
  396. } while (current != mypos);
  397. for(is = m_from + min_i; is < m_to; is += min_i){
  398. min_i = m_to - is;
  399. if (min_i >= GEMM3M_P * 2) {
  400. min_i = GEMM3M_P;
  401. } else
  402. if (min_i > GEMM3M_P) {
  403. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  404. }
  405. START_RPCC();
  406. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  407. STOP_RPCC(copy_A);
  408. current = mypos;
  409. do {
  410. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  411. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  412. START_RPCC();
  413. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  414. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  415. c, ldc, is, xxx);
  416. STOP_RPCC(kernel);
  417. #ifdef TIMING
  418. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  419. #endif
  420. if (is + min_i >= m_to) {
  421. /* Thread doesn't need this buffer any more */
  422. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  423. WMB;
  424. }
  425. }
  426. current ++;
  427. if (current >= args -> nthreads) current = 0;
  428. } while (current != mypos);
  429. } /* end of is */
  430. START_RPCC();
  431. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  432. STOP_RPCC(copy_A);
  433. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  434. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  435. START_RPCC();
  436. /* Make sure if no one is using another buffer */
  437. for (i = 0; i < args -> nthreads; i++)
  438. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  439. STOP_RPCC(waiting1);
  440. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  441. min_jj = MIN(n_to, xxx + div_n) - jjs;
  442. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  443. START_RPCC();
  444. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  445. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  446. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  447. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  448. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  449. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  450. #else
  451. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  452. #endif
  453. STOP_RPCC(copy_B);
  454. START_RPCC();
  455. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
  456. sa, buffer[bufferside] + min_l * (jjs - xxx),
  457. c, ldc, m_from, jjs);
  458. STOP_RPCC(kernel);
  459. #ifdef TIMING
  460. ops += 2 * min_i * min_jj * min_l;
  461. #endif
  462. }
  463. for (i = 0; i < args -> nthreads; i++)
  464. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  465. }
  466. current = mypos;
  467. do {
  468. current ++;
  469. if (current >= args -> nthreads) current = 0;
  470. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  471. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  472. if (current != mypos) {
  473. START_RPCC();
  474. /* thread has to wait */
  475. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  476. STOP_RPCC(waiting2);
  477. START_RPCC();
  478. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  479. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  480. c, ldc, m_from, xxx);
  481. STOP_RPCC(kernel);
  482. #ifdef TIMING
  483. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  484. #endif
  485. }
  486. if (m_to - m_from == min_i) {
  487. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  488. WMB;
  489. }
  490. }
  491. } while (current != mypos);
  492. for(is = m_from + min_i; is < m_to; is += min_i){
  493. min_i = m_to - is;
  494. if (min_i >= GEMM3M_P * 2) {
  495. min_i = GEMM3M_P;
  496. } else
  497. if (min_i > GEMM3M_P) {
  498. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  499. }
  500. START_RPCC();
  501. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  502. STOP_RPCC(copy_A);
  503. current = mypos;
  504. do {
  505. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  506. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  507. START_RPCC();
  508. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  509. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  510. c, ldc, is, xxx);
  511. STOP_RPCC(kernel);
  512. #ifdef TIMING
  513. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  514. #endif
  515. if (is + min_i >= m_to) {
  516. /* Thread doesn't need this buffer any more */
  517. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  518. }
  519. }
  520. current ++;
  521. if (current >= args -> nthreads) current = 0;
  522. } while (current != mypos);
  523. } /* end of is */
  524. START_RPCC();
  525. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  526. STOP_RPCC(copy_A);
  527. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  528. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  529. START_RPCC();
  530. /* Make sure if no one is using another buffer */
  531. for (i = 0; i < args -> nthreads; i++)
  532. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
  533. STOP_RPCC(waiting1);
  534. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  535. min_jj = MIN(n_to, xxx + div_n) - jjs;
  536. if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
  537. START_RPCC();
  538. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  539. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  540. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  541. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  542. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  543. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  544. #else
  545. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  546. #endif
  547. STOP_RPCC(copy_B);
  548. START_RPCC();
  549. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
  550. sa, buffer[bufferside] + min_l * (jjs - xxx),
  551. c, ldc, m_from, jjs);
  552. STOP_RPCC(kernel);
  553. #ifdef TIMING
  554. ops += 2 * min_i * min_jj * min_l;
  555. #endif
  556. }
  557. for (i = 0; i < args -> nthreads; i++)
  558. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  559. }
  560. current = mypos;
  561. do {
  562. current ++;
  563. if (current >= args -> nthreads) current = 0;
  564. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  565. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  566. if (current != mypos) {
  567. START_RPCC();
  568. /* thread has to wait */
  569. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
  570. STOP_RPCC(waiting2);
  571. START_RPCC();
  572. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  573. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  574. c, ldc, m_from, xxx);
  575. STOP_RPCC(kernel);
  576. #ifdef TIMING
  577. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  578. #endif
  579. }
  580. if (m_to - m_from == min_i) {
  581. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
  582. WMB;
  583. }
  584. }
  585. } while (current != mypos);
  586. for(is = m_from + min_i; is < m_to; is += min_i){
  587. min_i = m_to - is;
  588. if (min_i >= GEMM3M_P * 2) {
  589. min_i = GEMM3M_P;
  590. } else
  591. if (min_i > GEMM3M_P) {
  592. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  593. }
  594. START_RPCC();
  595. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  596. STOP_RPCC(copy_A);
  597. current = mypos;
  598. do {
  599. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  600. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  601. START_RPCC();
  602. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  603. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  604. c, ldc, is, xxx);
  605. STOP_RPCC(kernel);
  606. #ifdef TIMING
  607. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  608. #endif
  609. if (is + min_i >= m_to) {
  610. /* Thread doesn't need this buffer any more */
  611. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
  612. WMB;
  613. }
  614. }
  615. current ++;
  616. if (current >= args -> nthreads) current = 0;
  617. } while (current != mypos);
  618. } /* end of is */
  619. }
  620. START_RPCC();
  621. for (i = 0; i < args -> nthreads; i++) {
  622. for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
  623. while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
  624. }
  625. }
  626. STOP_RPCC(waiting3);
  627. #ifdef TIMING
  628. BLASLONG waiting = waiting1 + waiting2 + waiting3;
  629. BLASLONG total = copy_A + copy_B + kernel + waiting;
  630. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n",
  631. mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
  632. (double)waiting /(double)total * 100.,
  633. (double)ops/(double)kernel / 2. * 100.);
  634. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
  635. mypos, copy_A, copy_B, waiting);
  636. #if 0
  637. fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
  638. mypos,
  639. (double)waiting1/(double)waiting * 100.,
  640. (double)waiting2/(double)waiting * 100.,
  641. (double)waiting3/(double)waiting * 100.);
  642. #endif
  643. fprintf(stderr, "\n");
  644. #endif
  645. return 0;
  646. }
  647. static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  648. *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  649. #ifndef USE_OPENMP
  650. #ifndef OS_WINDOWS
  651. static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
  652. #else
  653. CRITICAL_SECTION level3_lock;
  654. InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
  655. #endif
  656. #endif
  657. blas_arg_t newarg;
  658. blas_queue_t queue[MAX_CPU_NUMBER];
  659. BLASLONG range_M[MAX_CPU_NUMBER + 1];
  660. BLASLONG range_N[MAX_CPU_NUMBER + 1];
  661. #ifndef USE_ALLOC_HEAP
  662. job_t job[MAX_CPU_NUMBER];
  663. #else
  664. job_t * job = NULL;
  665. #endif
  666. BLASLONG num_cpu_m, num_cpu_n;
  667. BLASLONG nthreads = args -> nthreads;
  668. BLASLONG width, i, j, k, js;
  669. BLASLONG m, n, n_from, n_to;
  670. int mode;
  671. #ifdef XDOUBLE
  672. mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
  673. #elif defined(DOUBLE)
  674. mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
  675. #else
  676. mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
  677. #endif
  678. #ifndef USE_OPENMP
  679. #ifndef OS_WINDOWS
  680. pthread_mutex_lock(&level3_lock);
  681. #else
  682. EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
  683. #endif
  684. #endif
  685. newarg.m = args -> m;
  686. newarg.n = args -> n;
  687. newarg.k = args -> k;
  688. newarg.a = args -> a;
  689. newarg.b = args -> b;
  690. newarg.c = args -> c;
  691. newarg.lda = args -> lda;
  692. newarg.ldb = args -> ldb;
  693. newarg.ldc = args -> ldc;
  694. newarg.alpha = args -> alpha;
  695. newarg.beta = args -> beta;
  696. newarg.nthreads = args -> nthreads;
  697. #ifdef USE_ALLOC_HEAP
  698. job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  699. if(job==NULL){
  700. fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
  701. exit(1);
  702. }
  703. #endif
  704. newarg.common = (void *)job;
  705. if (!range_m) {
  706. range_M[0] = 0;
  707. m = args -> m;
  708. } else {
  709. range_M[0] = range_m[0];
  710. m = range_m[1] - range_m[0];
  711. }
  712. num_cpu_m = 0;
  713. while (m > 0){
  714. width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
  715. m -= width;
  716. if (m < 0) width = width + m;
  717. range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
  718. num_cpu_m ++;
  719. }
  720. for (i = 0; i < num_cpu_m; i++) {
  721. queue[i].mode = mode;
  722. queue[i].routine = inner_thread;
  723. queue[i].args = &newarg;
  724. queue[i].range_m = &range_M[i];
  725. queue[i].range_n = &range_N[0];
  726. queue[i].sa = NULL;
  727. queue[i].sb = NULL;
  728. queue[i].next = &queue[i + 1];
  729. }
  730. queue[0].sa = sa;
  731. queue[0].sb = sb;
  732. if (!range_n) {
  733. n_from = 0;
  734. n_to = args -> n;
  735. } else {
  736. n_from = range_n[0];
  737. n_to = range_n[1];
  738. }
  739. for(js = n_from; js < n_to; js += GEMM_R * nthreads){
  740. n = n_to - js;
  741. if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
  742. range_N[0] = js;
  743. num_cpu_n = 0;
  744. while (n > 0){
  745. width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
  746. n -= width;
  747. if (n < 0) width = width + n;
  748. range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
  749. num_cpu_n ++;
  750. }
  751. for (j = 0; j < num_cpu_m; j++) {
  752. for (i = 0; i < num_cpu_m; i++) {
  753. for (k = 0; k < DIVIDE_RATE; k++) {
  754. job[j].working[i][CACHE_LINE_SIZE * k] = 0;
  755. }
  756. }
  757. }
  758. queue[num_cpu_m - 1].next = NULL;
  759. exec_blas(num_cpu_m, queue);
  760. }
  761. #ifdef USE_ALLOC_HEAP
  762. free(job);
  763. #endif
  764. #ifndef USE_OPENMP
  765. #ifndef OS_WINDOWS
  766. pthread_mutex_unlock(&level3_lock);
  767. #else
  768. LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
  769. #endif
  770. #endif
  771. return 0;
  772. }
  773. int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  774. BLASLONG m = args -> m;
  775. // BLASLONG n = args -> n;
  776. BLASLONG nthreads = args -> nthreads;
  777. BLASLONG divN, divT;
  778. int mode;
  779. if (range_m) {
  780. BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
  781. BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
  782. m = m_to - m_from;
  783. }
  784. /*
  785. if (range_n) {
  786. BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
  787. BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
  788. n = n_to - n_from;
  789. }
  790. */
  791. if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
  792. GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
  793. return 0;
  794. }
  795. divT = nthreads;
  796. divN = 1;
  797. while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
  798. do {
  799. divT --;
  800. divN = 1;
  801. while (divT * divN < nthreads) divN ++;
  802. } while ((divT * divN != nthreads) && (divT > 1));
  803. }
  804. args -> nthreads = divT;
  805. if (divN == 1){
  806. gemm_driver(args, range_m, range_n, sa, sb, 0);
  807. } else {
  808. #ifdef XDOUBLE
  809. mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  810. #elif defined(DOUBLE)
  811. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  812. #else
  813. mode = BLAS_SINGLE | BLAS_COMPLEX;
  814. #endif
  815. #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
  816. defined(CN) || defined(CT) || defined(CR) || defined(CC)
  817. mode |= (BLAS_TRANSA_T);
  818. #endif
  819. #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
  820. defined(NC) || defined(TC) || defined(RC) || defined(CC)
  821. mode |= (BLAS_TRANSB_T);
  822. #endif
  823. gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
  824. }
  825. return 0;
  826. }