@@ -105,6 +105,14 @@ typedef struct {
   BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
 
+#ifdef HAVE_C11
+#define atomic_load_long(p)      __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_long(p, v)  __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#else
+#define atomic_load_long(p)      (BLASLONG)(*(volatile BLASLONG*)(p))
+#define atomic_store_long(p, v)  (*(volatile BLASLONG *)(p)) = (v)
+#endif
+
 
 #ifndef KERNEL_OPERATION
 #ifndef COMPLEX
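The macros above are the core of the change: with HAVE_C11 they map to GCC's
__atomic_load_n/__atomic_store_n builtins with relaxed ordering, otherwise they
fall back to plain volatile accesses. Relaxed atomics only guarantee that the
BLASLONG flag word is read and written untorn; they impose no ordering on the
surrounding memory accesses, which is why the hunks below keep explicit MB/WMB
barriers next to them. A minimal sketch of the intended pairing, using a
hypothetical flag and payload (not code from this patch):

    /* Hedged sketch: how atomic_load_long/atomic_store_long pair with the
       MB/WMB barriers; `ready` and `data` are hypothetical names. */
    static BLASLONG ready = 0;
    static FLOAT data[16];

    static void publish(void) {
      data[0] = 42.0;                 /* fill the payload                  */
      WMB;                            /* payload writes before flag store  */
      atomic_store_long(&ready, 1);   /* untorn store publishes the flag   */
    }

    static void consume(void) {
      while (atomic_load_long(&ready) == 0) { YIELDING; }  /* spin */
      MB;                             /* flag read before payload reads    */
      /* data[0] may safely be read now */
    }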
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       }
 
 #ifndef LOWER
+      MB;
       for (i = 0; i <= mypos; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+//      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #else
+      MB;
       for (i = mypos; i < args -> nthreads; i++)
-        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
+        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
+//      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
 #endif
 
-      WMB;
+//      WMB;
     }
 
     min_i = m_to - m_from;
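In the hunk above, each thread publishes the panel it has just packed by
storing the buffer's address into the working slot of every thread that will
consume it (threads 0..mypos in the upper-triangular branch, mypos onward in
the lower-triangular one); a nonzero slot means "panel ready". Condensed into
one hypothetical helper for the upper-triangular branch (names follow the
patch, but the function itself is not part of it):

    /* Hedged sketch of the publish step; job_t, CACHE_LINE_SIZE, MB and
       atomic_store_long are as defined in the patch. */
    static void publish_panel(job_t *job, BLASLONG mypos,
                              BLASLONG bufferside, FLOAT *panel) {
      BLASLONG i;
      MB;   /* packed panel contents become visible before its address */
      for (i = 0; i <= mypos; i++)
        atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside],
                          (BLASLONG)panel);
    }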
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
 
         /* thread has to wait */
-        if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+        if (current != mypos)
+          do {
+            jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
+          } while (jw == 0);
+        MB;
+
+        //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
 
         KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                          sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                          c, lda, m_from, xxx);
 
         if (m_from + min_i >= m_to) {
-          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+          atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//        job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
           WMB;
         }
       }
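The hunk above is the matching consume side: a thread spins until the
producer's slot becomes nonzero, then uses the value it read as the panel
address passed to KERNEL_OPERATION. Two details worth noting: the rewritten
loop drops the YIELDING of the old (now commented-out) wait loop, and the
clearing store passes `slot &= 0` as the value, a plain read-modify-write that
evaluates to 0, so the net effect is storing 0. A hypothetical condensed form
of the wait (names follow the patch, the helper does not appear in it):

    /* Hedged sketch of the spin-wait; returns the published panel address. */
    static FLOAT *wait_for_panel(job_t *job, BLASLONG current,
                                 BLASLONG mypos, BLASLONG bufferside) {
      BLASLONG jw;
      do {
        jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
      } while (jw == 0);
      MB;                 /* flag read before reading the panel it points to */
      return (FLOAT *)jw;
    }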
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                          c, lda, is, xxx);
 
         if (is + min_i >= m_to) {
-          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+          atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
+//        job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
           WMB;
         }
       }
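This hunk applies the same slot-clearing change to the second consumer loop
(indexed by is rather than m_from). In both places the sequence appears
equivalent to simply storing 0 and fencing, a hedged reading of the patch:

    /* Hedged sketch: the clear-and-release step common to both hunks. */
    atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0);
    WMB;   /* the cleared slot is visible before this thread moves on */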
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   for (i = 0; i < args -> nthreads; i++) {
     if (i != mypos) {
-      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
+      for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
+#if 1
+      {
+        do {
+          jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
+        } while (jw);
+        MB;
+      }
+#else
         while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
-      }
+#endif
+//    }
     }
   }
 
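The final hunk is the drain barrier at the end of inner_thread: before
returning, each thread waits until every other thread has cleared all
DIVIDE_RATE slots it published, so no consumer can still be reading its
buffers; the #if 1/#else keeps the old YIELDING wait loop selectable. A
hypothetical condensed form (the helper and its name are not in the patch):

    /* Hedged sketch of the drain loop: wait until all consumers are done
       with every panel this thread published. */
    static void drain_consumers(job_t *job, blas_arg_t *args, BLASLONG mypos) {
      BLASLONG i, xxx, jw;
      for (i = 0; i < args->nthreads; i++) {
        if (i == mypos) continue;
        for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
          do {
            jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
          } while (jw != 0);   /* nonzero: thread i still holds the panel */
        }
      }
      MB;
    }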