| @@ -105,6 +105,14 @@ typedef struct { | |||||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | ||||
| } job_t; | } job_t; | ||||
| #ifdef HAVE_C11 | |||||
| #define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) | |||||
| #define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) | |||||
| #else | |||||
| #define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p)) | |||||
| #define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v) | |||||
| #endif | |||||
| #ifndef KERNEL_OPERATION | #ifndef KERNEL_OPERATION | ||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| @@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| } | } | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| MB; | |||||
| for (i = 0; i <= mypos; i++) | for (i = 0; i <= mypos; i++) | ||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]); | |||||
| // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| #else | #else | ||||
| MB | |||||
| for (i = mypos; i < args -> nthreads; i++) | for (i = mypos; i < args -> nthreads; i++) | ||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]); | |||||
| // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||||
| #endif | #endif | ||||
| WMB; | |||||
| // WMB; | |||||
| } | } | ||||
| min_i = m_to - m_from; | min_i = m_to - m_from; | ||||
| @@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | ||||
| /* thread has to wait */ | /* thread has to wait */ | ||||
| if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||||
| if (current != mypos) | |||||
| do { | |||||
| jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]); | |||||
| } while (jw == 0); | |||||
| MB; | |||||
| //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | ||||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | ||||
| c, lda, m_from, xxx); | c, lda, m_from, xxx); | ||||
| if (m_from + min_i >= m_to) { | if (m_from + min_i >= m_to) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0); | |||||
| // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| } | } | ||||
| @@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| c, lda, is, xxx); | c, lda, is, xxx); | ||||
| if (is + min_i >= m_to) { | if (is + min_i >= m_to) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0); | |||||
| // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| } | } | ||||
| @@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| for (i = 0; i < args -> nthreads; i++) { | for (i = 0; i < args -> nthreads; i++) { | ||||
| if (i != mypos) { | if (i != mypos) { | ||||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) | |||||
| #if 1 | |||||
| { | |||||
| do { | |||||
| jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]); | |||||
| } while (jw); | |||||
| MB; | |||||
| } | |||||
| #else | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | ||||
| } | |||||
| #endif | |||||
| // } | |||||
| } | } | ||||
| } | } | ||||