Browse Source

Merge pull request #4586 from martin-frbg/potrf-para

use atomic acq/rel operations in potrf_parallel as in the corresponding getrf_parallel
tags/v0.3.27
Martin Kroeker GitHub 1 year ago
parent
commit
e1638ea43a
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
1 changed files with 37 additions and 8 deletions
  1. +37
    -8
      lapack/potrf/potrf_parallel.c

+ 37
- 8
lapack/potrf/potrf_parallel.c View File

@@ -105,6 +105,14 @@ typedef struct {
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;


/*
 * Accessors for the shared job[].working[][] slots that threads use to
 * hand packed buffers to each other.
 *
 * atomic_load_long(p)     - read the BLASLONG at p.
 * atomic_store_long(p, v) - write v to the BLASLONG at p.
 *
 * With HAVE_C11 the GCC/Clang __atomic builtins are used. The load is an
 * acquire and the store is a release, so everything written before a slot
 * is published is visible to a thread that observes the new slot value;
 * relaxed ordering would not give that guarantee on its own.
 *
 * Without HAVE_C11 the macros fall back to plain volatile accesses, which
 * only stop the compiler from caching or eliding the access; any hardware
 * ordering must then come from explicit barriers at the call sites.
 *
 * Both fallback expansions are fully parenthesized so they behave as a
 * single expression wherever they are used (e.g. inside ?: or next to
 * other operators).
 */
#ifdef HAVE_C11
#define atomic_load_long(p) __atomic_load_n((p), __ATOMIC_ACQUIRE)
#define atomic_store_long(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)
#else
#define atomic_load_long(p) ((BLASLONG)(*(volatile BLASLONG *)(p)))
#define atomic_store_long(p, v) ((*(volatile BLASLONG *)(p)) = (v))
#endif



#ifndef KERNEL_OPERATION #ifndef KERNEL_OPERATION
#ifndef COMPLEX #ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} }


#ifndef LOWER #ifndef LOWER
MB;
for (i = 0; i <= mypos; i++) for (i = 0; i <= mypos; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else #else
MB
for (i = mypos; i < args -> nthreads; i++) for (i = mypos; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif #endif


WMB;
// WMB;
} }


min_i = m_to - m_from; min_i = m_to - m_from;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {


/* thread has to wait */ /* thread has to wait */
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
if (current != mypos)
do {
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
} while (jw == 0);
MB;

//while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};


KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, m_from, xxx); c, lda, m_from, xxx);


if (m_from + min_i >= m_to) { if (m_from + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB; WMB;
} }
} }
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
c, lda, is, xxx); c, lda, is, xxx);


if (is + min_i >= m_to) { if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB; WMB;
} }
} }
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,


for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
if (i != mypos) { if (i != mypos) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
#if 1
{
do {
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
} while (jw);
MB;
}
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
}
#endif
// }
} }
} }




Loading…
Cancel
Save