| @@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Make sure if no one is using workspace */ | /* Make sure if no one is using workspace */ | ||||
| START_RPCC(); | START_RPCC(); | ||||
| for (i = 0; i < args -> nthreads; i++) | for (i = 0; i < args -> nthreads; i++) | ||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||||
| STOP_RPCC(waiting1); | STOP_RPCC(waiting1); | ||||
| MB; | |||||
| #if defined(FUSED_GEMM) && !defined(TIMING) | #if defined(FUSED_GEMM) && !defined(TIMING) | ||||
| @@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| } | } | ||||
| #endif | #endif | ||||
| WMB; | |||||
| /* Set flag so other threads can access local region of B */ | /* Set flag so other threads can access local region of B */ | ||||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | ||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | ||||
| WMB; | |||||
| } | } | ||||
| /* Get regions of B from other threads and apply kernel */ | /* Get regions of B from other threads and apply kernel */ | ||||
| @@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Wait until other region of B is initialized */ | /* Wait until other region of B is initialized */ | ||||
| START_RPCC(); | START_RPCC(); | ||||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||||
| STOP_RPCC(waiting2); | STOP_RPCC(waiting2); | ||||
| MB; | |||||
| /* Apply kernel with local region of A and part of other region of B */ | /* Apply kernel with local region of A and part of other region of B */ | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Clear synchronization flag if this thread is done with other region of B */ | /* Clear synchronization flag if this thread is done with other region of B */ | ||||
| if (m_to - m_from == min_i) { | if (m_to - m_from == min_i) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| } | } | ||||
| } | } | ||||
| } while (current != mypos); | } while (current != mypos); | ||||
| @@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Clear synchronization flag if this thread is done with region of B */ | /* Clear synchronization flag if this thread is done with region of B */ | ||||
| if (is + min_i >= m_to) { | if (is + min_i >= m_to) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| } | } | ||||
| } | } | ||||
| @@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| START_RPCC(); | START_RPCC(); | ||||
| for (i = 0; i < args -> nthreads; i++) { | for (i = 0; i < args -> nthreads; i++) { | ||||
| for (js = 0; js < DIVIDE_RATE; js++) { | for (js = 0; js < DIVIDE_RATE; js++) { | ||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; | |||||
| } | } | ||||
| } | } | ||||
| STOP_RPCC(waiting3); | STOP_RPCC(waiting3); | ||||
| MB; | |||||
| #ifdef TIMING | #ifdef TIMING | ||||
| BLASLONG waiting = waiting1 + waiting2 + waiting3; | BLASLONG waiting = waiting1 + waiting2 + waiting3; | ||||
| @@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| WMB; | |||||
| /* Execute parallel computation */ | /* Execute parallel computation */ | ||||
| exec_blas(nthreads, queue); | exec_blas(nthreads, queue); | ||||
| } | } | ||||