You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

level3_gemm3m_thread.c 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Tuning constants.  Each one keeps any value that was defined before this
   * point and otherwise falls back to the default below.
   */

  /* Stride, in BLASLONG elements, between consecutive flag slots of a job_t row. */
  #if !defined(CACHE_LINE_SIZE)
  #define CACHE_LINE_SIZE 8
  #endif

  /* Number of packed-B buffers (and flag slots) each worker cycles through. */
  #if !defined(DIVIDE_RATE)
  #define DIVIDE_RATE 2
  #endif

  /* Factor applied to the thread count when deciding whether threading pays off. */
  #if !defined(SWITCH_RATIO)
  #define SWITCH_RATIO 2
  #endif

  /*
   * A job_t array sized for MAX_CPU_NUMBER threads may overflow the stack,
   * so above the threshold it is obtained from malloc instead.
   */
  #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
  #define USE_ALLOC_HEAP
  #endif

  /*
   * Single-threaded fallback routine, chosen from the two-letter
   * transpose/conjugate mode macro this file is compiled with.
   */
  #if !defined(GEMM3M_LOCAL)
  #if   defined(NN)
  #define GEMM3M_LOCAL GEMM3M_NN
  #elif defined(NT)
  #define GEMM3M_LOCAL GEMM3M_NT
  #elif defined(NR)
  #define GEMM3M_LOCAL GEMM3M_NR
  #elif defined(NC)
  #define GEMM3M_LOCAL GEMM3M_NC
  #elif defined(TN)
  #define GEMM3M_LOCAL GEMM3M_TN
  #elif defined(TT)
  #define GEMM3M_LOCAL GEMM3M_TT
  #elif defined(TR)
  #define GEMM3M_LOCAL GEMM3M_TR
  #elif defined(TC)
  #define GEMM3M_LOCAL GEMM3M_TC
  #elif defined(RN)
  #define GEMM3M_LOCAL GEMM3M_RN
  #elif defined(RT)
  #define GEMM3M_LOCAL GEMM3M_RT
  #elif defined(RR)
  #define GEMM3M_LOCAL GEMM3M_RR
  #elif defined(RC)
  #define GEMM3M_LOCAL GEMM3M_RC
  #elif defined(CN)
  #define GEMM3M_LOCAL GEMM3M_CN
  #elif defined(CT)
  #define GEMM3M_LOCAL GEMM3M_CT
  #elif defined(CR)
  #define GEMM3M_LOCAL GEMM3M_CR
  #elif defined(CC)
  #define GEMM3M_LOCAL GEMM3M_CC
  #endif
  #endif /* !defined(GEMM3M_LOCAL) */
  87. typedef struct {
  88. #if __STDC_VERSION__ >= 201112L
  89. _Atomic
  90. #else
  91. volatile
  92. #endif
  93. BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
  94. } job_t;
  /*
   * Operation macros.  Each can be overridden by defining it beforehand.
   * All of them address element (row, col) of a column-major matrix with
   * leading dimension LD as base + (row + col * LD) * COMPSIZE.
   *
   * None of the definitions ends in a semicolon, so every invocation is a
   * single expression that is safe in unbraced if/else bodies.
   */

  /* Scale the sub-block rows [M_FROM, M_TO) x columns [N_FROM, N_TO) of C by BETA. */
  #ifndef BETA_OPERATION
  #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
    GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
              (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
              (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
  #endif

  /*
   * Pack a block of A into BUFFER.  The B / R / I suffix selects which
   * GEMM3M_I?COPY{B,R,I} routine is used; modes whose first letter is N or R
   * use the ITCOPY variant with (Y, X) indexing, the others use INCOPY with
   * (X, Y) indexing.
   */
  #ifndef ICOPYB_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  #ifndef ICOPYR_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  #ifndef ICOPYI_OPERATION
  #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
      defined(RN) || defined(RT) || defined(RC) || defined(RR)
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #else
  #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
    GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
  #endif
  #endif

  /*
   * Pack a block of B (the macro parameter is still called A) into BUFFER,
   * passing ALPHA_R / ALPHA_I through to the copy routine.  Modes whose second
   * letter is N or R use the ONCOPY variant with (X, Y) indexing, the others
   * use OTCOPY with (Y, X) indexing.
   */
  #ifndef OCOPYB_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  #ifndef OCOPYR_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  #ifndef OCOPYI_OPERATION
  #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
      defined(NR) || defined(TR) || defined(CR) || defined(RR)
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #else
  #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
    GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER)
  #endif
  #endif

  /* Compute kernel applied to packed panels SA and SB. */
  #ifndef KERNEL_FUNC
  #define KERNEL_FUNC GEMM3M_KERNEL
  #endif

  /* Run the kernel on the M x N block of C whose top-left element is (X, Y). */
  #ifndef KERNEL_OPERATION
  #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
    KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, LDC)
  #endif
  /*
   * Shorthand accessors for the fields of the blas_arg_t that is in scope
   * under the name `args`.  Each keeps an earlier definition if there is one.
   */
  #if !defined(A)
  #define A   (args -> a)
  #endif
  #if !defined(LDA)
  #define LDA (args -> lda)
  #endif
  #if !defined(B)
  #define B   (args -> b)
  #endif
  #if !defined(LDB)
  #define LDB (args -> ldb)
  #endif
  #if !defined(C)
  #define C   (args -> c)
  #endif
  #if !defined(LDC)
  #define LDC (args -> ldc)
  #endif
  #if !defined(M)
  #define M   (args -> m)
  #endif
  #if !defined(N)
  #define N   (args -> n)
  #endif
  #if !defined(K)
  #define K   (args -> k)
  #endif

  /*
   * Kernel scaling pairs, one table per group of transpose/conjugate modes.
   * The pairs (ALPHA5, ALPHA6), (ALPHA11, ALPHA12) and (ALPHA17, ALPHA18) are
   * the ALPHA_R / ALPHA_I arguments handed to KERNEL_OPERATION.
   */

  /* Neither operand conjugated. */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define ALPHA1  ONE
  #define ALPHA2  ONE
  #define ALPHA5  ZERO
  #define ALPHA6  ONE
  #define ALPHA7  ONE
  #define ALPHA8  ZERO
  #define ALPHA11 ONE
  #define ALPHA12 (-ONE)
  #define ALPHA13 ZERO
  #define ALPHA14 ONE
  #define ALPHA17 (-ONE)
  #define ALPHA18 (-ONE)
  #endif

  /* Second mode letter is R or C, first is N or T. */
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define ALPHA1  ONE
  #define ALPHA2  ONE
  #define ALPHA5  ONE
  #define ALPHA6  ZERO
  #define ALPHA7  ZERO
  #define ALPHA8  ONE
  #define ALPHA11 (-ONE)
  #define ALPHA12 (-ONE)
  #define ALPHA13 ONE
  #define ALPHA14 ZERO
  #define ALPHA17 (-ONE)
  #define ALPHA18 ONE
  #endif

  /* First mode letter is R or C, second is N or T. */
  #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define ALPHA1  ONE
  #define ALPHA2  ONE
  #define ALPHA5  ONE
  #define ALPHA6  ZERO
  #define ALPHA7  ZERO
  #define ALPHA8  ONE
  #define ALPHA11 (-ONE)
  #define ALPHA12 ONE
  #define ALPHA13 ONE
  #define ALPHA14 ZERO
  #define ALPHA17 (-ONE)
  #define ALPHA18 (-ONE)
  #endif

  /* Both mode letters are R or C. */
  #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define ALPHA1  ONE
  #define ALPHA2  ONE
  #define ALPHA5  ZERO
  #define ALPHA6  (-ONE)
  #define ALPHA7  ONE
  #define ALPHA8  ZERO
  #define ALPHA11 ONE
  #define ALPHA12 ONE
  #define ALPHA13 ZERO
  #define ALPHA14 ONE
  #define ALPHA17 (-ONE)
  #define ALPHA18 ONE
  #endif

  /*
   * Cycle-counter bookkeeping.  With TIMING defined, START_RPCC samples rpcc()
   * into the local `rpcc_counter` and STOP_RPCC adds the elapsed count to
   * COUNTER; otherwise both expand to nothing.
   */
  #ifdef TIMING
  #define START_RPCC()       rpcc_counter = rpcc()
  #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
  #else
  #define START_RPCC()
  #define STOP_RPCC(COUNTER)
  #endif
  258. static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  259. BLASLONG k, lda, ldb, ldc;
  260. BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
  261. FLOAT *alpha, *beta;
  262. FLOAT *a, *b, *c;
  263. job_t *job = (job_t *)args -> common;
  264. BLASLONG xxx, bufferside;
  265. FLOAT *buffer[DIVIDE_RATE];
  266. BLASLONG ls, min_l, jjs, min_jj;
  267. BLASLONG is, min_i, div_n;
  268. BLASLONG i, current;
  269. #ifdef TIMING
  270. BLASLONG rpcc_counter;
  271. BLASLONG copy_A = 0;
  272. BLASLONG copy_B = 0;
  273. BLASLONG kernel = 0;
  274. BLASLONG waiting1 = 0;
  275. BLASLONG waiting2 = 0;
  276. BLASLONG waiting3 = 0;
  277. BLASLONG waiting6[MAX_CPU_NUMBER];
  278. BLASLONG ops = 0;
  279. for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
  280. #endif
  281. k = K;
  282. a = (FLOAT *)A;
  283. b = (FLOAT *)B;
  284. c = (FLOAT *)C;
  285. lda = LDA;
  286. ldb = LDB;
  287. ldc = LDC;
  288. alpha = (FLOAT *)args -> alpha;
  289. beta = (FLOAT *)args -> beta;
  290. m_from = 0;
  291. m_to = M;
  292. if (range_m) {
  293. m_from = range_m[0];
  294. m_to = range_m[1];
  295. }
  296. n_from = 0;
  297. n_to = N;
  298. N_from = 0;
  299. N_to = N;
  300. if (range_n) {
  301. n_from = range_n[mypos + 0];
  302. n_to = range_n[mypos + 1];
  303. N_from = range_n[0];
  304. N_to = range_n[args -> nthreads];
  305. }
  306. if (beta) {
  307. if ((beta[0] != ONE) || (beta[1] != ZERO))
  308. BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  309. }
  310. if ((k == 0) || (alpha == NULL)) return 0;
  311. if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
  312. #if 0
  313. fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
  314. mypos, m_from, m_to, n_from, n_to, N_from, N_to);
  315. #endif
  316. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  317. buffer[0] = sb;
  318. for (i = 1; i < DIVIDE_RATE; i++) {
  319. buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
  320. }
  321. for(ls = 0; ls < k; ls += min_l){
  322. min_l = k - ls;
  323. if (min_l >= GEMM3M_Q * 2) {
  324. min_l = GEMM3M_Q;
  325. } else {
  326. if (min_l > GEMM3M_Q) {
  327. min_l = (min_l + 1) / 2;
  328. }
  329. }
  330. min_i = m_to - m_from;
  331. if (min_i >= GEMM3M_P * 2) {
  332. min_i = GEMM3M_P;
  333. } else {
  334. if (min_i > GEMM3M_P) {
  335. min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  336. }
  337. }
  338. START_RPCC();
  339. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  340. STOP_RPCC(copy_A);
  341. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  342. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  343. START_RPCC();
  344. /* Make sure if no one is using another buffer */
  345. for (i = 0; i < args -> nthreads; i++)
  346. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  347. STOP_RPCC(waiting1);
  348. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  349. min_jj = MIN(n_to, xxx + div_n) - jjs;
  350. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  351. START_RPCC();
  352. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
  353. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  354. #else
  355. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  356. #endif
  357. STOP_RPCC(copy_B);
  358. START_RPCC();
  359. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
  360. sa, buffer[bufferside] + min_l * (jjs - xxx),
  361. c, ldc, m_from, jjs);
  362. STOP_RPCC(kernel);
  363. #ifdef TIMING
  364. ops += 2 * min_i * min_jj * min_l;
  365. #endif
  366. }
  367. for (i = 0; i < args -> nthreads; i++)
  368. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  369. }
  370. current = mypos;
  371. do {
  372. current ++;
  373. if (current >= args -> nthreads) current = 0;
  374. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  375. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  376. if (current != mypos) {
  377. START_RPCC();
  378. /* thread has to wait */
  379. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  380. STOP_RPCC(waiting2);
  381. START_RPCC();
  382. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  383. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  384. c, ldc, m_from, xxx);
  385. STOP_RPCC(kernel);
  386. #ifdef TIMING
  387. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  388. #endif
  389. }
  390. if (m_to - m_from == min_i) {
  391. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  392. }
  393. }
  394. } while (current != mypos);
  395. for(is = m_from + min_i; is < m_to; is += min_i){
  396. min_i = m_to - is;
  397. if (min_i >= GEMM3M_P * 2) {
  398. min_i = GEMM3M_P;
  399. } else
  400. if (min_i > GEMM3M_P) {
  401. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  402. }
  403. START_RPCC();
  404. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  405. STOP_RPCC(copy_A);
  406. current = mypos;
  407. do {
  408. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  409. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  410. START_RPCC();
  411. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  412. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  413. c, ldc, is, xxx);
  414. STOP_RPCC(kernel);
  415. #ifdef TIMING
  416. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  417. #endif
  418. if (is + min_i >= m_to) {
  419. /* Thread doesn't need this buffer any more */
  420. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  421. }
  422. }
  423. current ++;
  424. if (current >= args -> nthreads) current = 0;
  425. } while (current != mypos);
  426. } /* end of is */
  427. START_RPCC();
  428. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  429. STOP_RPCC(copy_A);
  430. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  431. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  432. START_RPCC();
  433. /* Make sure if no one is using another buffer */
  434. for (i = 0; i < args -> nthreads; i++)
  435. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  436. STOP_RPCC(waiting1);
  437. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  438. min_jj = MIN(n_to, xxx + div_n) - jjs;
  439. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  440. START_RPCC();
  441. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  442. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  443. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  444. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  445. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  446. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  447. #else
  448. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  449. #endif
  450. STOP_RPCC(copy_B);
  451. START_RPCC();
  452. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
  453. sa, buffer[bufferside] + min_l * (jjs - xxx),
  454. c, ldc, m_from, jjs);
  455. STOP_RPCC(kernel);
  456. #ifdef TIMING
  457. ops += 2 * min_i * min_jj * min_l;
  458. #endif
  459. }
  460. for (i = 0; i < args -> nthreads; i++)
  461. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  462. }
  463. current = mypos;
  464. do {
  465. current ++;
  466. if (current >= args -> nthreads) current = 0;
  467. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  468. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  469. if (current != mypos) {
  470. START_RPCC();
  471. /* thread has to wait */
  472. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  473. STOP_RPCC(waiting2);
  474. START_RPCC();
  475. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  476. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  477. c, ldc, m_from, xxx);
  478. STOP_RPCC(kernel);
  479. #ifdef TIMING
  480. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  481. #endif
  482. }
  483. if (m_to - m_from == min_i) {
  484. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  485. }
  486. }
  487. } while (current != mypos);
  488. for(is = m_from + min_i; is < m_to; is += min_i){
  489. min_i = m_to - is;
  490. if (min_i >= GEMM3M_P * 2) {
  491. min_i = GEMM3M_P;
  492. } else
  493. if (min_i > GEMM3M_P) {
  494. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  495. }
  496. START_RPCC();
  497. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  498. STOP_RPCC(copy_A);
  499. current = mypos;
  500. do {
  501. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  502. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  503. START_RPCC();
  504. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  505. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  506. c, ldc, is, xxx);
  507. STOP_RPCC(kernel);
  508. #ifdef TIMING
  509. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  510. #endif
  511. if (is + min_i >= m_to) {
  512. /* Thread doesn't need this buffer any more */
  513. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  514. }
  515. }
  516. current ++;
  517. if (current >= args -> nthreads) current = 0;
  518. } while (current != mypos);
  519. } /* end of is */
  520. START_RPCC();
  521. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  522. STOP_RPCC(copy_A);
  523. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  524. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  525. START_RPCC();
  526. /* Make sure if no one is using another buffer */
  527. for (i = 0; i < args -> nthreads; i++)
  528. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  529. STOP_RPCC(waiting1);
  530. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  531. min_jj = MIN(n_to, xxx + div_n) - jjs;
  532. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  533. START_RPCC();
  534. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  535. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  536. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  537. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  538. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  539. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  540. #else
  541. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  542. #endif
  543. STOP_RPCC(copy_B);
  544. START_RPCC();
  545. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
  546. sa, buffer[bufferside] + min_l * (jjs - xxx),
  547. c, ldc, m_from, jjs);
  548. STOP_RPCC(kernel);
  549. #ifdef TIMING
  550. ops += 2 * min_i * min_jj * min_l;
  551. #endif
  552. }
  553. for (i = 0; i < args -> nthreads; i++)
  554. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  555. }
  556. current = mypos;
  557. do {
  558. current ++;
  559. if (current >= args -> nthreads) current = 0;
  560. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  561. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  562. if (current != mypos) {
  563. START_RPCC();
  564. /* thread has to wait */
  565. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  566. STOP_RPCC(waiting2);
  567. START_RPCC();
  568. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  569. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  570. c, ldc, m_from, xxx);
  571. STOP_RPCC(kernel);
  572. #ifdef TIMING
  573. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  574. #endif
  575. }
  576. if (m_to - m_from == min_i) {
  577. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  578. }
  579. }
  580. } while (current != mypos);
  581. for(is = m_from + min_i; is < m_to; is += min_i){
  582. min_i = m_to - is;
  583. if (min_i >= GEMM3M_P * 2) {
  584. min_i = GEMM3M_P;
  585. } else
  586. if (min_i > GEMM3M_P) {
  587. min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
  588. }
  589. START_RPCC();
  590. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  591. STOP_RPCC(copy_A);
  592. current = mypos;
  593. do {
  594. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  595. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  596. START_RPCC();
  597. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  598. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  599. c, ldc, is, xxx);
  600. STOP_RPCC(kernel);
  601. #ifdef TIMING
  602. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  603. #endif
  604. if (is + min_i >= m_to) {
  605. /* Thread doesn't need this buffer any more */
  606. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  607. }
  608. }
  609. current ++;
  610. if (current >= args -> nthreads) current = 0;
  611. } while (current != mypos);
  612. } /* end of is */
  613. }
  614. START_RPCC();
  615. for (i = 0; i < args -> nthreads; i++) {
  616. for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
  617. while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
  618. }
  619. }
  620. STOP_RPCC(waiting3);
  621. #ifdef TIMING
  622. BLASLONG waiting = waiting1 + waiting2 + waiting3;
  623. BLASLONG total = copy_A + copy_B + kernel + waiting;
  624. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n",
  625. mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
  626. (double)waiting /(double)total * 100.,
  627. (double)ops/(double)kernel / 2. * 100.);
  628. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
  629. mypos, copy_A, copy_B, waiting);
  630. #if 0
  631. fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
  632. mypos,
  633. (double)waiting1/(double)waiting * 100.,
  634. (double)waiting2/(double)waiting * 100.,
  635. (double)waiting3/(double)waiting * 100.);
  636. #endif
  637. fprintf(stderr, "\n");
  638. #endif
  639. return 0;
  640. }
  641. static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  642. *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  643. blas_arg_t newarg;
  644. blas_queue_t queue[MAX_CPU_NUMBER];
  645. BLASLONG range_M[MAX_CPU_NUMBER + 1];
  646. BLASLONG range_N[MAX_CPU_NUMBER + 1];
  647. #ifndef USE_ALLOC_HEAP
  648. job_t job[MAX_CPU_NUMBER];
  649. #else
  650. job_t * job = NULL;
  651. #endif
  652. BLASLONG num_cpu_m, num_cpu_n;
  653. BLASLONG nthreads = args -> nthreads;
  654. BLASLONG width, i, j, k, js;
  655. BLASLONG m, n, n_from, n_to;
  656. int mode;
  657. #ifdef XDOUBLE
  658. mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
  659. #elif defined(DOUBLE)
  660. mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
  661. #else
  662. mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
  663. #endif
  664. newarg.m = args -> m;
  665. newarg.n = args -> n;
  666. newarg.k = args -> k;
  667. newarg.a = args -> a;
  668. newarg.b = args -> b;
  669. newarg.c = args -> c;
  670. newarg.lda = args -> lda;
  671. newarg.ldb = args -> ldb;
  672. newarg.ldc = args -> ldc;
  673. newarg.alpha = args -> alpha;
  674. newarg.beta = args -> beta;
  675. newarg.nthreads = args -> nthreads;
  676. #ifdef USE_ALLOC_HEAP
  677. job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  678. if(job==NULL){
  679. fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
  680. exit(1);
  681. }
  682. #endif
  683. newarg.common = (void *)job;
  684. if (!range_m) {
  685. range_M[0] = 0;
  686. m = args -> m;
  687. } else {
  688. range_M[0] = range_m[0];
  689. m = range_m[1] - range_m[0];
  690. }
  691. num_cpu_m = 0;
  692. while (m > 0){
  693. width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
  694. m -= width;
  695. if (m < 0) width = width + m;
  696. range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
  697. num_cpu_m ++;
  698. }
  699. for (i = 0; i < num_cpu_m; i++) {
  700. queue[i].mode = mode;
  701. queue[i].routine = inner_thread;
  702. queue[i].args = &newarg;
  703. queue[i].range_m = &range_M[i];
  704. queue[i].range_n = &range_N[0];
  705. queue[i].sa = NULL;
  706. queue[i].sb = NULL;
  707. queue[i].next = &queue[i + 1];
  708. }
  709. queue[0].sa = sa;
  710. queue[0].sb = sb;
  711. if (!range_n) {
  712. n_from = 0;
  713. n_to = args -> n;
  714. } else {
  715. n_from = range_n[0];
  716. n_to = range_n[1];
  717. }
  718. for(js = n_from; js < n_to; js += GEMM_R * nthreads){
  719. n = n_to - js;
  720. if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
  721. range_N[0] = js;
  722. num_cpu_n = 0;
  723. while (n > 0){
  724. width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
  725. n -= width;
  726. if (n < 0) width = width + n;
  727. range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
  728. num_cpu_n ++;
  729. }
  730. for (j = 0; j < num_cpu_m; j++) {
  731. for (i = 0; i < num_cpu_m; i++) {
  732. for (k = 0; k < DIVIDE_RATE; k++) {
  733. job[j].working[i][CACHE_LINE_SIZE * k] = 0;
  734. }
  735. }
  736. }
  737. queue[num_cpu_m - 1].next = NULL;
  738. exec_blas(num_cpu_m, queue);
  739. }
  740. #ifdef USE_ALLOC_HEAP
  741. free(job);
  742. #endif
  743. return 0;
  744. }
  745. int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  746. BLASLONG m = args -> m;
  747. // BLASLONG n = args -> n;
  748. BLASLONG nthreads = args -> nthreads;
  749. BLASLONG divN, divT;
  750. int mode;
  751. if (range_m) {
  752. BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
  753. BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
  754. m = m_to - m_from;
  755. }
  756. /*
  757. if (range_n) {
  758. BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
  759. BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
  760. n = n_to - n_from;
  761. }
  762. */
  763. if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
  764. GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
  765. return 0;
  766. }
  767. divT = nthreads;
  768. divN = 1;
  769. while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
  770. do {
  771. divT --;
  772. divN = 1;
  773. while (divT * divN < nthreads) divN ++;
  774. } while ((divT * divN != nthreads) && (divT > 1));
  775. }
  776. args -> nthreads = divT;
  777. if (divN == 1){
  778. gemm_driver(args, range_m, range_n, sa, sb, 0);
  779. } else {
  780. #ifdef XDOUBLE
  781. mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  782. #elif defined(DOUBLE)
  783. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  784. #else
  785. mode = BLAS_SINGLE | BLAS_COMPLEX;
  786. #endif
  787. #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
  788. defined(CN) || defined(CT) || defined(CR) || defined(CC)
  789. mode |= (BLAS_TRANSA_T);
  790. #endif
  791. #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
  792. defined(NC) || defined(TC) || defined(RC) || defined(CC)
  793. mode |= (BLAS_TRANSB_T);
  794. #endif
  795. gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
  796. }
  797. return 0;
  798. }