|
@@ -36,6 +36,13 @@ |
|
|
/* or implied, of The University of Texas at Austin. */ |
|
|
/* or implied, of The University of Texas at Austin. */ |
|
|
/*********************************************************************/ |
|
|
/*********************************************************************/ |
|
|
|
|
|
|
|
|
|
|
|
#if _STDC_VERSION__ >= 201112L |
|
|
|
|
|
#ifndef _Atomic |
|
|
|
|
|
#define _Atomic volatile |
|
|
|
|
|
#endif |
|
|
|
|
|
#include <stdatomic.h> |
|
|
|
|
|
#endif |
|
|
|
|
|
#include <stdbool.h> |
|
|
#include <stdio.h> |
|
|
#include <stdio.h> |
|
|
#include <stdlib.h> |
|
|
#include <stdlib.h> |
|
|
//#include <sys/mman.h> |
|
|
//#include <sys/mman.h> |
|
@@ -49,11 +56,16 @@ |
|
|
|
|
|
|
|
|
int blas_server_avail = 0; |
|
|
int blas_server_avail = 0; |
|
|
|
|
|
|
|
|
static void * blas_thread_buffer[MAX_CPU_NUMBER]; |
|
|
|
|
|
|
|
|
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; |
|
|
|
|
|
#if _STDC_VERSION__ >= 201112L |
|
|
|
|
|
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; |
|
|
|
|
|
#else |
|
|
|
|
|
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
void goto_set_num_threads(int num_threads) { |
|
|
void goto_set_num_threads(int num_threads) { |
|
|
|
|
|
|
|
|
int i=0; |
|
|
|
|
|
|
|
|
int i=0, j=0; |
|
|
|
|
|
|
|
|
if (num_threads < 1) num_threads = blas_num_threads; |
|
|
if (num_threads < 1) num_threads = blas_num_threads; |
|
|
|
|
|
|
|
@@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) { |
|
|
omp_set_num_threads(blas_cpu_number); |
|
|
omp_set_num_threads(blas_cpu_number); |
|
|
|
|
|
|
|
|
//adjust buffer for each thread |
|
|
//adjust buffer for each thread |
|
|
for(i=0; i<blas_cpu_number; i++){ |
|
|
|
|
|
if(blas_thread_buffer[i]==NULL){ |
|
|
|
|
|
blas_thread_buffer[i]=blas_memory_alloc(2); |
|
|
|
|
|
|
|
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) { |
|
|
|
|
|
for(j=0; j<blas_cpu_number; j++){ |
|
|
|
|
|
if(blas_thread_buffer[i][j]==NULL){ |
|
|
|
|
|
blas_thread_buffer[i][j]=blas_memory_alloc(2); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
for(; i<MAX_CPU_NUMBER; i++){ |
|
|
|
|
|
if(blas_thread_buffer[i]!=NULL){ |
|
|
|
|
|
blas_memory_free(blas_thread_buffer[i]); |
|
|
|
|
|
blas_thread_buffer[i]=NULL; |
|
|
|
|
|
|
|
|
for(; j<MAX_CPU_NUMBER; j++){ |
|
|
|
|
|
if(blas_thread_buffer[i][j]!=NULL){ |
|
|
|
|
|
blas_memory_free(blas_thread_buffer[i][j]); |
|
|
|
|
|
blas_thread_buffer[i][j]=NULL; |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
#if defined(ARCH_MIPS64) |
|
|
#if defined(ARCH_MIPS64) |
|
@@ -92,30 +106,34 @@ void openblas_set_num_threads(int num_threads) { |
|
|
|
|
|
|
|
|
int blas_thread_init(void){ |
|
|
int blas_thread_init(void){ |
|
|
|
|
|
|
|
|
int i=0; |
|
|
|
|
|
|
|
|
int i=0, j=0; |
|
|
|
|
|
|
|
|
blas_get_cpu_number(); |
|
|
blas_get_cpu_number(); |
|
|
|
|
|
|
|
|
blas_server_avail = 1; |
|
|
blas_server_avail = 1; |
|
|
|
|
|
|
|
|
for(i=0; i<blas_num_threads; i++){ |
|
|
|
|
|
blas_thread_buffer[i]=blas_memory_alloc(2); |
|
|
|
|
|
} |
|
|
|
|
|
for(; i<MAX_CPU_NUMBER; i++){ |
|
|
|
|
|
blas_thread_buffer[i]=NULL; |
|
|
|
|
|
|
|
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) { |
|
|
|
|
|
for(j=0; j<blas_num_threads; j++){ |
|
|
|
|
|
blas_thread_buffer[i][j]=blas_memory_alloc(2); |
|
|
|
|
|
} |
|
|
|
|
|
for(; j<MAX_CPU_NUMBER; j++){ |
|
|
|
|
|
blas_thread_buffer[i][j]=NULL; |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return 0; |
|
|
return 0; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int BLASFUNC(blas_thread_shutdown)(void){ |
|
|
int BLASFUNC(blas_thread_shutdown)(void){ |
|
|
int i=0; |
|
|
|
|
|
|
|
|
int i=0, j=0; |
|
|
blas_server_avail = 0; |
|
|
blas_server_avail = 0; |
|
|
|
|
|
|
|
|
for(i=0; i<MAX_CPU_NUMBER; i++){ |
|
|
|
|
|
if(blas_thread_buffer[i]!=NULL){ |
|
|
|
|
|
blas_memory_free(blas_thread_buffer[i]); |
|
|
|
|
|
blas_thread_buffer[i]=NULL; |
|
|
|
|
|
|
|
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) { |
|
|
|
|
|
for(j=0; j<MAX_CPU_NUMBER; j++){ |
|
|
|
|
|
if(blas_thread_buffer[i][j]!=NULL){ |
|
|
|
|
|
blas_memory_free(blas_thread_buffer[i][j]); |
|
|
|
|
|
blas_thread_buffer[i][j]=NULL; |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
@@ -206,7 +224,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
static void exec_threads(blas_queue_t *queue){ |
|
|
|
|
|
|
|
|
static void exec_threads(blas_queue_t *queue, int buf_index){ |
|
|
|
|
|
|
|
|
void *buffer, *sa, *sb; |
|
|
void *buffer, *sa, *sb; |
|
|
int pos=0, release_flag=0; |
|
|
int pos=0, release_flag=0; |
|
@@ -223,7 +241,7 @@ static void exec_threads(blas_queue_t *queue){ |
|
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { |
|
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { |
|
|
|
|
|
|
|
|
pos = omp_get_thread_num(); |
|
|
pos = omp_get_thread_num(); |
|
|
buffer = blas_thread_buffer[pos]; |
|
|
|
|
|
|
|
|
buffer = blas_thread_buffer[buf_index][pos]; |
|
|
|
|
|
|
|
|
//fallback |
|
|
//fallback |
|
|
if(buffer==NULL) { |
|
|
if(buffer==NULL) { |
|
@@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){ |
|
|
|
|
|
|
|
|
int exec_blas(BLASLONG num, blas_queue_t *queue){ |
|
|
int exec_blas(BLASLONG num, blas_queue_t *queue){ |
|
|
|
|
|
|
|
|
BLASLONG i; |
|
|
|
|
|
|
|
|
BLASLONG i, buf_index; |
|
|
|
|
|
|
|
|
if ((num <= 0) || (queue == NULL)) return 0; |
|
|
if ((num <= 0) || (queue == NULL)) return 0; |
|
|
|
|
|
|
|
@@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ |
|
|
} |
|
|
} |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
while(true) { |
|
|
|
|
|
for(i=0; i < MAX_PARALLEL_NUMBER; i++) { |
|
|
|
|
|
#if _STDC_VERSION__ >= 201112L |
|
|
|
|
|
_Bool inuse = false; |
|
|
|
|
|
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { |
|
|
|
|
|
#else |
|
|
|
|
|
if(blas_buffer_inuse[i] == false) { |
|
|
|
|
|
blas_buffer_inuse[i] = true; |
|
|
|
|
|
#endif |
|
|
|
|
|
buf_index = i; |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
if(i != MAX_PARALLEL_NUMBER) |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
#pragma omp parallel for schedule(static) |
|
|
#pragma omp parallel for schedule(static) |
|
|
for (i = 0; i < num; i ++) { |
|
|
for (i = 0; i < num; i ++) { |
|
|
|
|
|
|
|
@@ -309,9 +344,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ |
|
|
queue[i].position = i; |
|
|
queue[i].position = i; |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
|
|
|
exec_threads(&queue[i]); |
|
|
|
|
|
|
|
|
exec_threads(&queue[i], buf_index); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#if _STDC_VERSION__ >= 201112L |
|
|
|
|
|
atomic_store(&blas_buffer_inuse[buf_index], false); |
|
|
|
|
|
#else |
|
|
|
|
|
blas_buffer_inuse[buf_index] = false; |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
return 0; |
|
|
return 0; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|