You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

level3_gemm3m_thread.c 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Tuning constants; each can be overridden by defining it before this point. */
  /* CACHE_LINE_SIZE: stride, in BLASLONG elements, between consecutive flag slots of job_t.working. */
  38. #ifndef CACHE_LINE_SIZE
  39. #define CACHE_LINE_SIZE 8
  40. #endif
  /* DIVIDE_RATE: number of sub-buffers each worker splits its own N slice (and sb) into. */
  41. #ifndef DIVIDE_RATE
  42. #define DIVIDE_RATE 2
  43. #endif
  /* SWITCH_RATIO: factor used by the entry point in its threshold tests (nthreads * SWITCH_RATIO, m * SWITCH_RATIO). */
  44. #ifndef SWITCH_RATIO
  45. #define SWITCH_RATIO 2
  46. #endif
  47. //The array of job_t may overflow the stack.
  48. //Instead, use malloc to alloc job_t.
  /* When set, gemm_driver obtains the job_t array from malloc instead of declaring it as a local array. */
  49. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
  50. #define USE_ALLOC_HEAP
  51. #endif
  /*
   * GEMM3M_LOCAL is the routine the entry point calls when it decides not to
   * use the threaded driver. Unless already defined, it is chosen from the
   * two-letter mode macro (NN, NT, ..., CC) that the build defines.
   * NOTE(review): the letters presumably encode the operation applied to A and
   * B (N/T/R/C) - confirm against the build files.
   * If none of the mode macros is defined, GEMM3M_LOCAL stays undefined.
   */
  52. #ifndef GEMM3M_LOCAL
  53. #if defined(NN)
  54. #define GEMM3M_LOCAL GEMM3M_NN
  55. #elif defined(NT)
  56. #define GEMM3M_LOCAL GEMM3M_NT
  57. #elif defined(NR)
  58. #define GEMM3M_LOCAL GEMM3M_NR
  59. #elif defined(NC)
  60. #define GEMM3M_LOCAL GEMM3M_NC
  61. #elif defined(TN)
  62. #define GEMM3M_LOCAL GEMM3M_TN
  63. #elif defined(TT)
  64. #define GEMM3M_LOCAL GEMM3M_TT
  65. #elif defined(TR)
  66. #define GEMM3M_LOCAL GEMM3M_TR
  67. #elif defined(TC)
  68. #define GEMM3M_LOCAL GEMM3M_TC
  69. #elif defined(RN)
  70. #define GEMM3M_LOCAL GEMM3M_RN
  71. #elif defined(RT)
  72. #define GEMM3M_LOCAL GEMM3M_RT
  73. #elif defined(RR)
  74. #define GEMM3M_LOCAL GEMM3M_RR
  75. #elif defined(RC)
  76. #define GEMM3M_LOCAL GEMM3M_RC
  77. #elif defined(CN)
  78. #define GEMM3M_LOCAL GEMM3M_CN
  79. #elif defined(CT)
  80. #define GEMM3M_LOCAL GEMM3M_CT
  81. #elif defined(CR)
  82. #define GEMM3M_LOCAL GEMM3M_CR
  83. #elif defined(CC)
  84. #define GEMM3M_LOCAL GEMM3M_CC
  85. #endif
  86. #endif
  /*
   * Per-worker synchronisation record; gemm_driver passes an array of these to
   * the workers through args->common.
   * job[t].working[i][CACHE_LINE_SIZE * s] is the slot through which worker t
   * hands its packed-B sub-buffer s to worker i: worker t stores the buffer
   * address there, worker i spins until it is non-zero, and writes 0 back once
   * it no longer needs the buffer.
   * NOTE(review): the CACHE_LINE_SIZE stride presumably keeps the slots on
   * separate cache lines - confirm.
   */
  87. typedef struct {
  88. volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
  89. } job_t;
  /*
   * BETA_OPERATION: call GEMM_BETA on the sub-block of C with rows
   * [M_FROM, M_TO) and columns [N_FROM, N_TO), passing BETA[0] and BETA[1]
   * as the two scalar arguments.
   *
   * The element offset (M_FROM + N_FROM * LDC) is multiplied as a whole by
   * COMPSIZE, matching how every other macro in this file addresses A, B
   * and C. Scaling only the column term would start the block at the wrong
   * row whenever M_FROM is non-zero.
   *
   * Every macro argument is parenthesised so that expression arguments
   * (for example "a + b") expand with the intended precedence.
   */
  #ifndef BETA_OPERATION
  #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
    GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
              (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
              (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
  #endif
  /*
   * Packing macros. ICOPY* forward to a GEMM3M_I{T,N}COPY{B,R,I} routine and
   * OCOPY* to a GEMM3M_O{N,T}COPY{B,R,I} routine, passing a pointer into the
   * source matrix at element offset (X, Y) and the destination BUFFER.
   * The mode macro defined by the build selects which routine is used and
   * whether the offset is formed as Y + X * LDA or X + Y * LDA; the whole
   * element offset is multiplied by COMPSIZE.
   * The OCOPY* forms additionally pass ALPHA_R and ALPHA_I through.
   * NOTE(review): the B / R / I suffixes presumably select the combined, real
   * and imaginary parts used by the 3M scheme - confirm against the kernels.
   * NOTE(review): each expansion already ends with a semicolon, so a call site
   * that adds its own produces an extra empty statement.
   */
  96. #ifndef ICOPYB_OPERATION
  97. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  98. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  99. #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  100. GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  101. #else
  102. #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  103. GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  104. #endif
  105. #endif
  106. #ifndef ICOPYR_OPERATION
  107. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  108. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  109. #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  110. GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  111. #else
  112. #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  113. GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  114. #endif
  115. #endif
  116. #ifndef ICOPYI_OPERATION
  117. #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
  118. defined(RN) || defined(RT) || defined(RC) || defined(RR)
  119. #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  120. GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
  121. #else
  122. #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  123. GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
  124. #endif
  125. #endif
  /* Outer (B-side) packing: selection depends on the second mode letter (N or R versus the rest). */
  126. #ifndef OCOPYB_OPERATION
  127. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  128. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  129. #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  130. GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  131. #else
  132. #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  133. GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  134. #endif
  135. #endif
  136. #ifndef OCOPYR_OPERATION
  137. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  138. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  139. #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  140. GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  141. #else
  142. #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  143. GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  144. #endif
  145. #endif
  146. #ifndef OCOPYI_OPERATION
  147. #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
  148. defined(NR) || defined(TR) || defined(CR) || defined(RR)
  149. #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  150. GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  151. #else
  152. #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \
  153. GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER);
  154. #endif
  155. #endif
  /* KERNEL_FUNC names the compute kernel; it defaults to GEMM3M_KERNEL. */
  156. #ifndef KERNEL_FUNC
  157. #define KERNEL_FUNC GEMM3M_KERNEL
  158. #endif
  /*
   * KERNEL_OPERATION: run KERNEL_FUNC on an M x N x K problem using the packed
   * buffers SA and SB, with the output pointer set to element (X, Y) of C,
   * i.e. offset (X + Y * LDC) * COMPSIZE, and the scalar pair ALPHA_R, ALPHA_I.
   * NOTE(review): LDC is not parenthesised inside the offset expression, so it
   * should only be given a simple identifier or primary expression.
   */
  159. #ifndef KERNEL_OPERATION
  160. #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \
  161. KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC)
  162. #endif
  /*
   * Shorthand accessors for the fields of the blas_arg_t pointed to by a local
   * variable named args. Each expands to args -> field, so they are only
   * usable where such a variable is in scope. Each can be overridden by
   * defining it earlier.
   * NOTE(review): these are very short, unparenthesised macro names (A, B, C,
   * M, N, K); any identifier with the same spelling later in the translation
   * unit is rewritten as well.
   */
  163. #ifndef A
  164. #define A args -> a
  165. #endif
  166. #ifndef LDA
  167. #define LDA args -> lda
  168. #endif
  169. #ifndef B
  170. #define B args -> b
  171. #endif
  172. #ifndef LDB
  173. #define LDB args -> ldb
  174. #endif
  175. #ifndef C
  176. #define C args -> c
  177. #endif
  178. #ifndef LDC
  179. #define LDC args -> ldc
  180. #endif
  181. #ifndef M
  182. #define M args -> m
  183. #endif
  184. #ifndef N
  185. #define N args -> n
  186. #endif
  187. #ifndef K
  188. #define K args -> k
  189. #endif
  /*
   * Per-mode scalar constants (ONE, -ONE or ZERO). inner_thread passes them in
   * pairs as the (ALPHA_R, ALPHA_I) arguments of KERNEL_OPERATION:
   *   ALPHA5 / ALPHA6   in the pass that packs with the *B copy routines,
   *   ALPHA11 / ALPHA12 in the second pass,
   *   ALPHA17 / ALPHA18 in the third pass.
   * ALPHA1, ALPHA2, ALPHA7, ALPHA8, ALPHA13 and ALPHA14 are defined here but
   * not referenced by the code visible in this file.
   * Exactly one of the four groups applies, selected by the build mode macro.
   * NOTE(review): the signs presumably encode how each partial product of the
   * 3M scheme is accumulated into the real and imaginary parts of C for the
   * given conjugation combination - confirm against the kernel documentation.
   * NOTE(review): the negative values expand to an unparenthesised -ONE.
   */
  190. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  191. #define ALPHA1 ONE
  192. #define ALPHA2 ONE
  193. #define ALPHA5 ZERO
  194. #define ALPHA6 ONE
  195. #define ALPHA7 ONE
  196. #define ALPHA8 ZERO
  197. #define ALPHA11 ONE
  198. #define ALPHA12 -ONE
  199. #define ALPHA13 ZERO
  200. #define ALPHA14 ONE
  201. #define ALPHA17 -ONE
  202. #define ALPHA18 -ONE
  203. #endif
  /* Second operand letter R or C, first operand letter N or T. */
  204. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  205. #define ALPHA1 ONE
  206. #define ALPHA2 ONE
  207. #define ALPHA5 ONE
  208. #define ALPHA6 ZERO
  209. #define ALPHA7 ZERO
  210. #define ALPHA8 ONE
  211. #define ALPHA11 -ONE
  212. #define ALPHA12 -ONE
  213. #define ALPHA13 ONE
  214. #define ALPHA14 ZERO
  215. #define ALPHA17 -ONE
  216. #define ALPHA18 ONE
  217. #endif
  /* First operand letter R or C, second operand letter N or T. */
  218. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  219. #define ALPHA1 ONE
  220. #define ALPHA2 ONE
  221. #define ALPHA5 ONE
  222. #define ALPHA6 ZERO
  223. #define ALPHA7 ZERO
  224. #define ALPHA8 ONE
  225. #define ALPHA11 -ONE
  226. #define ALPHA12 ONE
  227. #define ALPHA13 ONE
  228. #define ALPHA14 ZERO
  229. #define ALPHA17 -ONE
  230. #define ALPHA18 -ONE
  231. #endif
  /* Both operand letters R or C. */
  232. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  233. #define ALPHA1 ONE
  234. #define ALPHA2 ONE
  235. #define ALPHA5 ZERO
  236. #define ALPHA6 -ONE
  237. #define ALPHA7 ONE
  238. #define ALPHA8 ZERO
  239. #define ALPHA11 ONE
  240. #define ALPHA12 ONE
  241. #define ALPHA13 ZERO
  242. #define ALPHA14 ONE
  243. #define ALPHA17 -ONE
  244. #define ALPHA18 ONE
  245. #endif
  /*
   * Optional instrumentation. With TIMING defined, START_RPCC() stores the
   * current rpcc() value in a local named rpcc_counter and STOP_RPCC(COUNTER)
   * adds the elapsed rpcc() difference to COUNTER; both rely on that local
   * being declared by the enclosing function. Without TIMING they expand to
   * nothing.
   */
  246. #ifdef TIMING
  247. #define START_RPCC() rpcc_counter = rpcc()
  248. #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter
  249. #else
  250. #define START_RPCC()
  251. #define STOP_RPCC(COUNTER)
  252. #endif
  /*
   * inner_thread - body executed by each worker queued by gemm_driver.
   *
   * args     problem description; args->common is the shared job_t array and
   *          args->nthreads the number of cooperating workers.
   * range_m  if non-NULL, range_m[0] and range_m[1] bound the rows of C this
   *          worker updates; otherwise rows 0 .. M are used.
   * range_n  boundaries of the column slices: this worker packs columns
   *          range_n[mypos] .. range_n[mypos + 1] of B, and the slices of all
   *          workers span range_n[0] .. range_n[args->nthreads]. Although the
   *          start of the function tolerates NULL, the peer loops below read
   *          range_n unconditionally, so it must be non-NULL in practice.
   * sa, sb   scratch buffers for the packed A block and the packed B slice.
   * mypos    index of this worker, used to index both job[] and range_n[].
   *
   * Always returns 0.
   *
   * For every K chunk the same sequence is run three times, once with each of
   * the B / R / I packing routines and its ALPHA pair:
   *   1. pack the first block of rows of A into sa;
   *   2. pack the own column slice of B into the sub-buffers, run the kernel
   *      on it, then publish each sub-buffer address to every worker;
   *   3. walk over the other workers, wait for their sub-buffers and run the
   *      kernel on them;
   *   4. repeat the kernel for the remaining row blocks, releasing each
   *      sub-buffer (writing 0 to its slot) after the last row block.
   * NOTE(review): the three passes presumably correspond to the three real
   * products of the 3M complex multiplication scheme - confirm.
   */
  253. static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  254. BLASLONG k, lda, ldb, ldc;
  255. BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;
  256. FLOAT *alpha, *beta;
  257. FLOAT *a, *b, *c;
  258. job_t *job = (job_t *)args -> common;
  259. BLASLONG xxx, bufferside;
  260. FLOAT *buffer[DIVIDE_RATE];
  261. BLASLONG ls, min_l, jjs, min_jj;
  262. BLASLONG is, min_i, div_n;
  263. BLASLONG i, current;
  264. #ifdef TIMING
  265. BLASLONG rpcc_counter;
  266. BLASLONG copy_A = 0;
  267. BLASLONG copy_B = 0;
  268. BLASLONG kernel = 0;
  269. BLASLONG waiting1 = 0;
  270. BLASLONG waiting2 = 0;
  271. BLASLONG waiting3 = 0;
  272. BLASLONG waiting6[MAX_CPU_NUMBER];
  273. BLASLONG ops = 0;
  274. for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
  275. #endif
  /* Local copies of the problem description (K, A, B, C, LDA, ... expand to args -> field). */
  276. k = K;
  277. a = (FLOAT *)A;
  278. b = (FLOAT *)B;
  279. c = (FLOAT *)C;
  280. lda = LDA;
  281. ldb = LDB;
  282. ldc = LDC;
  283. alpha = (FLOAT *)args -> alpha;
  284. beta = (FLOAT *)args -> beta;
  /* Row range handled by this worker. */
  285. m_from = 0;
  286. m_to = M;
  287. if (range_m) {
  288. m_from = range_m[0];
  289. m_to = range_m[1];
  290. }
  /* n_from .. n_to is the slice this worker packs; N_from .. N_to covers the slices of all workers. */
  291. n_from = 0;
  292. n_to = N;
  293. N_from = 0;
  294. N_to = N;
  295. if (range_n) {
  296. n_from = range_n[mypos + 0];
  297. n_to = range_n[mypos + 1];
  298. N_from = range_n[0];
  299. N_to = range_n[args -> nthreads];
  300. }
  /* Apply beta to this worker rows over the full column range, unless beta is exactly (ONE, ZERO). */
  301. if (beta) {
  302. if ((beta[0] != ONE) || (beta[1] != ZERO))
  303. BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  304. }
  /* Nothing to accumulate when K is zero, alpha is missing, or alpha is exactly zero. */
  305. if ((k == 0) || (alpha == NULL)) return 0;
  306. if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;
  307. #if 0
  308. fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
  309. mypos, m_from, m_to, n_from, n_to, N_from, N_to);
  310. #endif
  /*
   * Split sb into DIVIDE_RATE sub-buffers. div_n is the number of columns per
   * sub-buffer (rounded up); consecutive sub-buffers are GEMM3M_Q times div_n
   * rounded up to a multiple of GEMM3M_UNROLL_N apart. The mask form of the
   * rounding assumes GEMM3M_UNROLL_N is a power of two.
   */
  311. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  312. buffer[0] = sb;
  313. for (i = 1; i < DIVIDE_RATE; i++) {
  314. buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
  315. }
  /* Loop over K in chunks of min_l: GEMM3M_Q while at least 2 * GEMM3M_Q remain, otherwise half (rounded up) or all of the rest. */
  316. for(ls = 0; ls < k; ls += min_l){
  317. min_l = k - ls;
  318. if (min_l >= GEMM3M_Q * 2) {
  319. min_l = GEMM3M_Q;
  320. } else {
  321. if (min_l > GEMM3M_Q) {
  322. min_l = (min_l + 1) / 2;
  323. }
  324. }
  /* Size of the first row block: GEMM3M_P, or about half the rows rounded up to GEMM3M_UNROLL_M, or all rows. */
  325. min_i = m_to - m_from;
  326. if (min_i >= GEMM3M_P * 2) {
  327. min_i = GEMM3M_P;
  328. } else {
  329. if (min_i > GEMM3M_P) {
  330. min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
  331. }
  332. }
  /* ---- Pass 1: ICOPYB / OCOPYB with the ALPHA5, ALPHA6 pair ---- */
  333. START_RPCC();
  334. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  335. STOP_RPCC(copy_A);
  336. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  /* Pack the own column slice, one sub-buffer at a time. */
  337. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  338. START_RPCC();
  339. /* Make sure if no one is using another buffer */
  340. for (i = 0; i < args -> nthreads; i++)
  341. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  342. STOP_RPCC(waiting1);
  /* Pack at most GEMM3M_UNROLL_N columns at a time and immediately run the kernel on them. */
  343. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  344. min_jj = MIN(n_to, xxx + div_n) - jjs;
  345. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  346. START_RPCC();
  /* The remaining modes pass the negated imaginary part of alpha. */
  347. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
  348. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  349. #else
  350. OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  351. #endif
  352. STOP_RPCC(copy_B);
  353. START_RPCC();
  354. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
  355. sa, buffer[bufferside] + min_l * (jjs - xxx),
  356. c, ldc, m_from, jjs);
  357. STOP_RPCC(kernel);
  358. #ifdef TIMING
  359. ops += 2 * min_i * min_jj * min_l;
  360. #endif
  361. }
  /* Publish this sub-buffer: every worker slot receives its address. */
  362. for (i = 0; i < args -> nthreads; i++)
  363. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  364. }
  /* Visit the other workers in ring order, starting after mypos and ending at mypos itself. */
  365. current = mypos;
  366. do {
  367. current ++;
  368. if (current >= args -> nthreads) current = 0;
  369. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  370. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  /* The own slice was already multiplied while packing, so the kernel is skipped for current == mypos. */
  371. if (current != mypos) {
  372. START_RPCC();
  373. /* thread has to wait */
  374. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  375. STOP_RPCC(waiting2);
  376. START_RPCC();
  377. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  378. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  379. c, ldc, m_from, xxx);
  380. STOP_RPCC(kernel);
  381. #ifdef TIMING
  382. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  383. #endif
  384. }
  /* If the first row block covered all rows, this sub-buffer is no longer needed: release it. */
  385. if (m_to - m_from == min_i) {
  386. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  387. }
  388. }
  389. } while (current != mypos);
  /* Remaining row blocks: repack A and reuse the already published B sub-buffers of every worker. */
  390. for(is = m_from + min_i; is < m_to; is += min_i){
  391. min_i = m_to - is;
  392. if (min_i >= GEMM3M_P * 2) {
  393. min_i = GEMM3M_P;
  394. } else
  395. if (min_i > GEMM3M_P) {
  396. min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
  397. }
  398. START_RPCC();
  399. ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  400. STOP_RPCC(copy_A);
  401. current = mypos;
  402. do {
  403. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  404. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  405. START_RPCC();
  406. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
  407. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  408. c, ldc, is, xxx);
  409. STOP_RPCC(kernel);
  /* NOTE(review): this operation count uses a different column expression than the kernel call above - confirm which is intended. */
  410. #ifdef TIMING
  411. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  412. #endif
  413. if (is + min_i >= m_to) {
  414. /* Thread doesn't need this buffer any more */
  415. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  416. }
  417. }
  418. current ++;
  419. if (current >= args -> nthreads) current = 0;
  420. } while (current != mypos);
  421. } /* end of is */
  /*
   * ---- Pass 2: ICOPYR on A, OCOPYR or OCOPYI on B (by mode), ALPHA11, ALPHA12 pair ----
   * min_i is not recomputed here; the first row block of this pass reuses the
   * value left by the preceding row-block loop.
   */
  422. START_RPCC();
  423. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  424. STOP_RPCC(copy_A);
  425. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  426. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  427. START_RPCC();
  428. /* Make sure if no one is using another buffer */
  429. for (i = 0; i < args -> nthreads; i++)
  430. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  431. STOP_RPCC(waiting1);
  432. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  433. min_jj = MIN(n_to, xxx + div_n) - jjs;
  434. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  435. START_RPCC();
  /* The mode selects both the copy routine (R or I) and the sign of the imaginary part of alpha. */
  436. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  437. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  438. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  439. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  440. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  441. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  442. #else
  443. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  444. #endif
  445. STOP_RPCC(copy_B);
  446. START_RPCC();
  447. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
  448. sa, buffer[bufferside] + min_l * (jjs - xxx),
  449. c, ldc, m_from, jjs);
  450. STOP_RPCC(kernel);
  451. #ifdef TIMING
  452. ops += 2 * min_i * min_jj * min_l;
  453. #endif
  454. }
  /* Publish this sub-buffer to every worker. */
  455. for (i = 0; i < args -> nthreads; i++)
  456. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  457. }
  /* Consume the sub-buffers of the other workers, as in pass 1. */
  458. current = mypos;
  459. do {
  460. current ++;
  461. if (current >= args -> nthreads) current = 0;
  462. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  463. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  464. if (current != mypos) {
  465. START_RPCC();
  466. /* thread has to wait */
  467. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  468. STOP_RPCC(waiting2);
  469. START_RPCC();
  470. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  471. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  472. c, ldc, m_from, xxx);
  473. STOP_RPCC(kernel);
  474. #ifdef TIMING
  475. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  476. #endif
  477. }
  478. if (m_to - m_from == min_i) {
  479. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  480. }
  481. }
  482. } while (current != mypos);
  /* Remaining row blocks of pass 2. */
  483. for(is = m_from + min_i; is < m_to; is += min_i){
  484. min_i = m_to - is;
  485. if (min_i >= GEMM3M_P * 2) {
  486. min_i = GEMM3M_P;
  487. } else
  488. if (min_i > GEMM3M_P) {
  489. min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
  490. }
  491. START_RPCC();
  492. ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  493. STOP_RPCC(copy_A);
  494. current = mypos;
  495. do {
  496. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  497. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  498. START_RPCC();
  499. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
  500. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  501. c, ldc, is, xxx);
  502. STOP_RPCC(kernel);
  503. #ifdef TIMING
  504. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  505. #endif
  506. if (is + min_i >= m_to) {
  507. /* Thread doesn't need this buffer any more */
  508. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  509. }
  510. }
  511. current ++;
  512. if (current >= args -> nthreads) current = 0;
  513. } while (current != mypos);
  514. } /* end of is */
  /* ---- Pass 3: ICOPYI on A, OCOPYI or OCOPYR on B (by mode), ALPHA17, ALPHA18 pair ---- */
  515. START_RPCC();
  516. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
  517. STOP_RPCC(copy_A);
  518. div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  519. for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
  520. START_RPCC();
  521. /* Make sure if no one is using another buffer */
  522. for (i = 0; i < args -> nthreads; i++)
  523. while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
  524. STOP_RPCC(waiting1);
  525. for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
  526. min_jj = MIN(n_to, xxx + div_n) - jjs;
  527. if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
  528. START_RPCC();
  /* Same mode dispatch as pass 2 with the R and I copy routines exchanged. */
  529. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  530. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  531. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  532. OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  533. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  534. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  535. #else
  536. OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
  537. #endif
  538. STOP_RPCC(copy_B);
  539. START_RPCC();
  540. KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
  541. sa, buffer[bufferside] + min_l * (jjs - xxx),
  542. c, ldc, m_from, jjs);
  543. STOP_RPCC(kernel);
  544. #ifdef TIMING
  545. ops += 2 * min_i * min_jj * min_l;
  546. #endif
  547. }
  /* Publish this sub-buffer to every worker. */
  548. for (i = 0; i < args -> nthreads; i++)
  549. job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
  550. }
  /* Consume the sub-buffers of the other workers, as in pass 1. */
  551. current = mypos;
  552. do {
  553. current ++;
  554. if (current >= args -> nthreads) current = 0;
  555. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  556. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  557. if (current != mypos) {
  558. START_RPCC();
  559. /* thread has to wait */
  560. while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
  561. STOP_RPCC(waiting2);
  562. START_RPCC();
  563. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  564. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  565. c, ldc, m_from, xxx);
  566. STOP_RPCC(kernel);
  567. #ifdef TIMING
  568. ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
  569. #endif
  570. }
  571. if (m_to - m_from == min_i) {
  572. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  573. }
  574. }
  575. } while (current != mypos);
  /* Remaining row blocks of pass 3. */
  576. for(is = m_from + min_i; is < m_to; is += min_i){
  577. min_i = m_to - is;
  578. if (min_i >= GEMM3M_P * 2) {
  579. min_i = GEMM3M_P;
  580. } else
  581. if (min_i > GEMM3M_P) {
  582. min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
  583. }
  584. START_RPCC();
  585. ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
  586. STOP_RPCC(copy_A);
  587. current = mypos;
  588. do {
  589. div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
  590. for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
  591. START_RPCC();
  592. KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
  593. sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
  594. c, ldc, is, xxx);
  595. STOP_RPCC(kernel);
  596. #ifdef TIMING
  597. ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
  598. #endif
  599. if (is + min_i >= m_to) {
  600. /* Thread doesn't need this buffer any more */
  601. job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
  602. }
  603. }
  604. current ++;
  605. if (current >= args -> nthreads) current = 0;
  606. } while (current != mypos);
  607. } /* end of is */
  608. }
  /* Before returning, wait until every worker has released every sub-buffer owned by this worker. */
  609. START_RPCC();
  610. for (i = 0; i < args -> nthreads; i++) {
  611. for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
  612. while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
  613. }
  614. }
  615. STOP_RPCC(waiting3);
  /* Optional per-worker timing report written to stderr. */
  616. #ifdef TIMING
  617. BLASLONG waiting = waiting1 + waiting2 + waiting3;
  618. BLASLONG total = copy_A + copy_B + kernel + waiting;
  619. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n",
  620. mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
  621. (double)waiting /(double)total * 100.,
  622. (double)ops/(double)kernel / 2. * 100.);
  623. fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
  624. mypos, copy_A, copy_B, waiting);
  625. #if 0
  626. fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
  627. mypos,
  628. (double)waiting1/(double)waiting * 100.,
  629. (double)waiting2/(double)waiting * 100.,
  630. (double)waiting3/(double)waiting * 100.);
  631. #endif
  632. fprintf(stderr, "\n");
  633. #endif
  634. return 0;
  635. }
  636. static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
  637. *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  638. blas_arg_t newarg;
  639. blas_queue_t queue[MAX_CPU_NUMBER];
  640. BLASLONG range_M[MAX_CPU_NUMBER + 1];
  641. BLASLONG range_N[MAX_CPU_NUMBER + 1];
  642. #ifndef USE_ALLOC_HEAP
  643. job_t job[MAX_CPU_NUMBER];
  644. #else
  645. job_t * job = NULL;
  646. #endif
  647. BLASLONG num_cpu_m, num_cpu_n;
  648. BLASLONG nthreads = args -> nthreads;
  649. BLASLONG width, i, j, k, js;
  650. BLASLONG m, n, n_from, n_to;
  651. int mode;
  652. #ifdef XDOUBLE
  653. mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
  654. #elif defined(DOUBLE)
  655. mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
  656. #else
  657. mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
  658. #endif
  659. newarg.m = args -> m;
  660. newarg.n = args -> n;
  661. newarg.k = args -> k;
  662. newarg.a = args -> a;
  663. newarg.b = args -> b;
  664. newarg.c = args -> c;
  665. newarg.lda = args -> lda;
  666. newarg.ldb = args -> ldb;
  667. newarg.ldc = args -> ldc;
  668. newarg.alpha = args -> alpha;
  669. newarg.beta = args -> beta;
  670. newarg.nthreads = args -> nthreads;
  671. #ifdef USE_ALLOC_HEAP
  672. job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  673. if(job==NULL){
  674. fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
  675. exit(1);
  676. }
  677. #endif
  678. newarg.common = (void *)job;
  679. if (!range_m) {
  680. range_M[0] = 0;
  681. m = args -> m;
  682. } else {
  683. range_M[0] = range_m[0];
  684. m = range_m[1] - range_m[0];
  685. }
  686. num_cpu_m = 0;
  687. while (m > 0){
  688. width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
  689. m -= width;
  690. if (m < 0) width = width + m;
  691. range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;
  692. num_cpu_m ++;
  693. }
  694. for (i = 0; i < num_cpu_m; i++) {
  695. queue[i].mode = mode;
  696. queue[i].routine = inner_thread;
  697. queue[i].args = &newarg;
  698. queue[i].range_m = &range_M[i];
  699. queue[i].range_n = &range_N[0];
  700. queue[i].sa = NULL;
  701. queue[i].sb = NULL;
  702. queue[i].next = &queue[i + 1];
  703. }
  704. queue[0].sa = sa;
  705. queue[0].sb = sb;
  706. if (!range_n) {
  707. n_from = 0;
  708. n_to = args -> n;
  709. } else {
  710. n_from = range_n[0];
  711. n_to = range_n[1];
  712. }
  713. for(js = n_from; js < n_to; js += GEMM_R * nthreads){
  714. n = n_to - js;
  715. if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
  716. range_N[0] = js;
  717. num_cpu_n = 0;
  718. while (n > 0){
  719. width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
  720. n -= width;
  721. if (n < 0) width = width + n;
  722. range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
  723. num_cpu_n ++;
  724. }
  725. for (j = 0; j < num_cpu_m; j++) {
  726. for (i = 0; i < num_cpu_m; i++) {
  727. for (k = 0; k < DIVIDE_RATE; k++) {
  728. job[j].working[i][CACHE_LINE_SIZE * k] = 0;
  729. }
  730. }
  731. }
  732. queue[num_cpu_m - 1].next = NULL;
  733. exec_blas(num_cpu_m, queue);
  734. }
  735. #ifdef USE_ALLOC_HEAP
  736. free(job);
  737. #endif
  738. return 0;
  739. }
  740. int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
  741. BLASLONG m = args -> m;
  742. BLASLONG n = args -> n;
  743. BLASLONG nthreads = args -> nthreads;
  744. BLASLONG divN, divT;
  745. int mode;
  746. if (range_m) {
  747. BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
  748. BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
  749. m = m_to - m_from;
  750. }
  751. if (range_n) {
  752. BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
  753. BLASLONG n_to = *(((BLASLONG *)range_n) + 1);
  754. n = n_to - n_from;
  755. }
  756. if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
  757. GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
  758. return 0;
  759. }
  760. divT = nthreads;
  761. divN = 1;
  762. while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
  763. do {
  764. divT --;
  765. divN = 1;
  766. while (divT * divN < nthreads) divN ++;
  767. } while ((divT * divN != nthreads) && (divT > 1));
  768. }
  769. args -> nthreads = divT;
  770. if (divN == 1){
  771. gemm_driver(args, range_m, range_n, sa, sb, 0);
  772. } else {
  773. #ifdef XDOUBLE
  774. mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  775. #elif defined(DOUBLE)
  776. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  777. #else
  778. mode = BLAS_SINGLE | BLAS_COMPLEX;
  779. #endif
  780. #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
  781. defined(CN) || defined(CT) || defined(CR) || defined(CC)
  782. mode |= (BLAS_TRANSA_T);
  783. #endif
  784. #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
  785. defined(NC) || defined(TC) || defined(RC) || defined(CC)
  786. mode |= (BLAS_TRANSB_T);
  787. #endif
  788. gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
  789. }
  790. return 0;
  791. }