Browse Source

Merge pull request #2 from martin-frbg/develop

merge develop
tags/v0.3.1
Martin Kroeker GitHub 7 years ago
parent
commit
e6d93f20f1
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 419 additions and 362 deletions
  1. +21
    -7
      Makefile
  2. +13
    -2
      Makefile.rule
  3. +7
    -0
      Makefile.x86_64
  4. +5
    -0
      cblas.h
  5. +9
    -8
      common_stackalloc.h
  6. +7
    -2
      common_x86_64.h
  7. +17
    -0
      cpuid_x86.c
  8. +7
    -1
      ctest/Makefile
  9. +18
    -22
      driver/level3/level3_thread.c
  10. +5
    -1
      driver/others/blas_server_omp.c
  11. +272
    -298
      driver/others/memory.c
  12. +16
    -4
      interface/Makefile
  13. +2
    -2
      param.h
  14. +7
    -2
      test/Makefile
  15. +0
    -2
      utest/CMakeLists.txt
  16. +0
    -2
      utest/Makefile
  17. +13
    -9
      utest/test_fork.c

+ 21
- 7
Makefile View File

@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif

ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif

LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -47,7 +58,7 @@ endif
endif

@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif
ifneq ($(OSNAME), AIX)
@@ -108,7 +119,7 @@ endif
endif

tests :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@@ -210,7 +221,7 @@ netlib :

else
netlib : lapack_prebuild
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
@@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof

lapack_prebuild :
ifndef NOFORTRAN
$(info filter value of NOFORTRAN is:)
$(info x$(filter-out $(NOFORTRAN), 1 2)x)

ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -274,21 +288,21 @@ endif
endif

large.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif

timing.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif

lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING


+ 13
- 2
Makefile.rule View File

@@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1

# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic

# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
@@ -156,8 +164,11 @@ NO_AFFINITY = 1
# CONSISTENT_FPCSR = 1

# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4

# If you need santy check by comparing reference BLAS. It'll be very


+ 7
- 0
Makefile.x86_64 View File

@@ -8,6 +8,13 @@ endif
endif
endif

ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
endif
endif

ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif


+ 5
- 0
cblas.h View File

@@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);


+ 9
- 8
common_stackalloc.h View File

@@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
/* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.


+ 7
- 2
common_x86_64.h View File

@@ -60,8 +60,13 @@
#endif
*/

#define MB
#define WMB
#ifdef __GNUC__
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif

static void __inline blas_lock(volatile BLASULONG *address){



+ 17
- 0
cpuid_x86.c View File

@@ -1339,6 +1339,23 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 6:
switch (model) {
case 6: // Cannon Lake
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
}
break;
case 9:
case 8:
switch (model) {


+ 7
- 1
ctest/Makefile View File

@@ -102,7 +102,13 @@ clean ::
rm -f x*

FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif

# Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)


+ 18
- 22
driver/level3/level3_thread.c View File

@@ -91,11 +91,7 @@
#endif

typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

@@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1);

#if defined(FUSED_GEMM) && !defined(TIMING)

/* Fused operation to copy region of B into workspace and apply kernel */
@@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif

/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
/* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
WMB;
}
}

/* Get regions of B from other threads and apply kernel */
@@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);

/* Apply kernel with local region of A and part of other region of B */
@@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);

/* Iterate through steps of m
/* Iterate through steps of m
* Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
@@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js);
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
@@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
}
}
STOP_RPCC(waiting3);
@@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}

/* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) {
for (i = 0; i < nthreads; i++) {
for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
}


+ 5
- 1
driver/others/blas_server_omp.c View File

@@ -48,6 +48,10 @@

#else

#ifndef OMP_SCHED
#define OMP_SCHED static
#endif

int blas_server_avail = 0;

static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}

#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {

#ifndef USE_SIMPLE_THREADED_LEVEL3


+ 272
- 298
driver/others/memory.c
File diff suppressed because it is too large
View File


+ 16
- 4
interface/Makefile View File

@@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)

CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
@@ -277,7 +277,7 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX)

CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
@@ -294,7 +294,7 @@ CDBLAS3OBJS += \
cblas_dgeadd.$(SUFFIX)

CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_ccopy.$(SUFFIX) \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
@@ -320,7 +320,7 @@ CCBLAS3OBJS = \


CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
@@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)



+ 2
- 2
param.h View File

@@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define SYMV_P 8

#define SWITCH_RATIO 4
#define SWITCH_RATIO 32

#ifdef ARCH_X86

@@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define SYMV_P 8

#define SWITCH_RATIO 4
#define SWITCH_RATIO 32

#ifdef ARCH_X86



+ 7
- 2
test/Makefile View File

@@ -122,8 +122,13 @@ endif


FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =

ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif

sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME)
$(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)


+ 0
- 2
utest/CMakeLists.txt View File

@@ -25,7 +25,6 @@ endif ()

# known to hang with the native Windows and Android threads
# FIXME needs checking if this works on any of the other platforms
if (NOT NO_CBLAS)
if (NOT USE_OPENMP)
if (OS_CYGWIN_NT OR OS_LINUX)
set(OpenBLAS_utest_src
@@ -34,7 +33,6 @@ set(OpenBLAS_utest_src
)
endif()
endif()
endif()

if (NOT NO_LAPACK)
set(OpenBLAS_utest_src


+ 0
- 2
utest/Makefile View File

@@ -17,13 +17,11 @@ endif

#this does not work with OpenMP nor with native Windows or Android threads
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
ifneq ($(NO_CBLAS), 1)
ifndef USE_OPENMP
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
OBJS += test_fork.o
endif
endif
endif

all : run_test



+ 13
- 9
utest/test_fork.c View File

@@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -48,11 +48,13 @@ void* xmalloc(size_t n)
}
}

void check_dgemm(double *a, double *b, double *result, double *expected, int n)
void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
{
char trans1 = 'T';
char trans2 = 'N';
double zerod = 0, oned = 1;
int i;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
1.0, a, n, b, n, 0.0, result, n);
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
for(i = 0; i < n * n; ++i) {
ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
}
@@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n)

CTEST(fork, safety)
{
int n = 1000;
blasint n = 1000;
int i;

double *a, *b, *c, *d;
@@ -84,8 +86,10 @@ CTEST(fork, safety)

// Compute a DGEMM product in the parent process prior to forking to
// ensure that the OpenBLAS thread pool is initialized.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
1.0, a, n, b, n, 0.0, c, n);
char trans1 = 'T';
char trans2 = 'N';
double zerod = 0, oned = 1;
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n);

fork_pid = fork();
if (fork_pid == -1) {


Loading…
Cancel
Save