| @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) | |||
| RELA = re_lapack | |||
| endif | |||
| ifeq ($(NO_FORTRAN), 1) | |||
| define NOFORTRAN | |||
| 1 | |||
| endef | |||
| define NO_LAPACK | |||
| 1 | |||
| endef | |||
| export NOFORTRAN | |||
| export NO_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| @@ -47,7 +58,7 @@ endif | |||
| endif | |||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) | |||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @@ -108,7 +119,7 @@ endif | |||
| endif | |||
| tests : | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| @@ -210,7 +221,7 @@ netlib : | |||
| else | |||
| netlib : lapack_prebuild | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| @@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| lapack_prebuild : | |||
| ifndef NOFORTRAN | |||
| $(info filter value of NOFORTRAN is:) | |||
| $(info x$(filter-out $(NOFORTRAN), 1 2)x) | |||
| ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) | |||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -274,21 +288,21 @@ endif | |||
| endif | |||
| large.tgz : | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) | |||
| if [ ! -a $< ]; then | |||
| -wget http://www.netlib.org/lapack/timing/large.tgz; | |||
| fi | |||
| endif | |||
| timing.tgz : | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) | |||
| if [ ! -a $< ]; then | |||
| -wget http://www.netlib.org/lapack/timing/timing.tgz; | |||
| fi | |||
| endif | |||
| lapack-timing : large.tgz timing.tgz | |||
| ifndef NOFORTRAN | |||
| ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) | |||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | |||
| @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev | |||
| # This flag is always set for POWER8. Don't modify the flag | |||
| # USE_OPENMP = 1 | |||
| # The OpenMP scheduler to use - by default this is "static" and you | |||
| # will normally not want to change this unless you know that your main | |||
| # workload will involve tasks that have highly unbalanced running times | |||
| # for individual threads. Changing away from "static" may also adversely | |||
| # affect memory access locality in NUMA systems. Setting to "runtime" will | |||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
| # You can define maximum number of threads. Basically it should be | |||
| # less than actual number of cores. If you don't specify one, it's | |||
| # automatically detected by the the script. | |||
| @@ -156,8 +164,11 @@ NO_AFFINITY = 1 | |||
| # CONSISTENT_FPCSR = 1 | |||
| # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute | |||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | |||
| # in small matrix sizes. The default value is 4. | |||
| # with single thread. (Actually in recent versions this is a factor proportional to the | |||
| # number of floating point operations necessary for the given problem size, no longer | |||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | |||
| # threading in small matrix sizes. The default value is 4, but values as high as 50 have | |||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # If you need santy check by comparing reference BLAS. It'll be very | |||
| @@ -8,6 +8,13 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| ARFLAGS = -m x64 | |||
| endif | |||
| @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * - large enough to support all architectures and kernel | |||
| * Chosing a too small SIZE will lead to a stack smashing. | |||
| */ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| /* do not restore all register */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ | |||
| stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| /* do not restore all register */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| /* Avoid declaring an array of length 0 */ \ | |||
| TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ | |||
| __attribute__((aligned(0x20))); \ | |||
| BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); | |||
| #else | |||
| //Original OpenBLAS/GotoBLAS codes. | |||
| @@ -60,8 +60,13 @@ | |||
| #endif | |||
| */ | |||
| #define MB | |||
| #define WMB | |||
| #ifdef __GNUC__ | |||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #else | |||
| #define MB do {} while (0) | |||
| #define WMB do {} while (0) | |||
| #endif | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| @@ -1339,6 +1339,23 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| switch (model) { | |||
| case 6: // Cannon Lake | |||
| #ifndef NO_AVX512 | |||
| return CPUTYPE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| #endif | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| @@ -102,7 +102,13 @@ clean :: | |||
| rm -f x* | |||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| CEXTRALIB = | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| # Single real | |||
| xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | |||
| @@ -91,11 +91,7 @@ | |||
| #endif | |||
| typedef struct { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | |||
| for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | |||
| /* Make sure if no one is using workspace */ | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| STOP_RPCC(waiting1); | |||
| #if defined(FUSED_GEMM) && !defined(TIMING) | |||
| /* Fused operation to copy region of B into workspace and apply kernel */ | |||
| @@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| } | |||
| #endif | |||
| /* Set flag so other threads can access local region of B */ | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { | |||
| /* Make sure if no one is using workspace */ | |||
| START_RPCC(); | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| /* Set flag so other threads can access local region of B */ | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| WMB; | |||
| WMB; | |||
| } | |||
| } | |||
| /* Get regions of B from other threads and apply kernel */ | |||
| @@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Wait until other region of B is initialized */ | |||
| START_RPCC(); | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| /* Apply kernel with local region of A and part of other region of B */ | |||
| @@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Clear synchronization flag if this thread is done with other region of B */ | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| /* Iterate through steps of m | |||
| /* Iterate through steps of m | |||
| * Note: First step has already been finished */ | |||
| for(is = m_from + min_i; is < m_to; is += min_i){ | |||
| min_i = m_to - is; | |||
| @@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, ldc, is, js); | |||
| STOP_RPCC(kernel); | |||
| #ifdef TIMING | |||
| ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | |||
| #endif | |||
| /* Clear synchronization flag if this thread is done with region of B */ | |||
| if (is + min_i >= m_to) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| @@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| for (js = 0; js < DIVIDE_RATE; js++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; | |||
| } | |||
| } | |||
| STOP_RPCC(waiting3); | |||
| @@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| } | |||
| /* Clear synchronization flags */ | |||
| for (i = 0; i < MAX_CPU_NUMBER; i++) { | |||
| for (j = 0; j < MAX_CPU_NUMBER; j++) { | |||
| for (i = 0; i < nthreads; i++) { | |||
| for (j = 0; j < nthreads; j++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| job[i].working[j][CACHE_LINE_SIZE * k] = 0; | |||
| } | |||
| @@ -48,6 +48,10 @@ | |||
| #else | |||
| #ifndef OMP_SCHED | |||
| #define OMP_SCHED static | |||
| #endif | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| break; | |||
| } | |||
| #pragma omp parallel for schedule(static) | |||
| #pragma omp parallel for schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ | |||
| idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) | |||
| CSBLAS1OBJS = \ | |||
| cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||
| @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||
| @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ | |||
| cblas_dgeadd.$(SUFFIX) | |||
| CCBLAS1OBJS = \ | |||
| cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_ccopy.$(SUFFIX) \ | |||
| cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c | |||
| cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 8 | |||
| #define SWITCH_RATIO 4 | |||
| #define SWITCH_RATIO 32 | |||
| #ifdef ARCH_X86 | |||
| @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 8 | |||
| #define SWITCH_RATIO 4 | |||
| #define SWITCH_RATIO 32 | |||
| #ifdef ARCH_X86 | |||
| @@ -122,8 +122,13 @@ endif | |||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| CEXTRALIB = | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| @@ -25,7 +25,6 @@ endif () | |||
| # known to hang with the native Windows and Android threads | |||
| # FIXME needs checking if this works on any of the other platforms | |||
| if (NOT NO_CBLAS) | |||
| if (NOT USE_OPENMP) | |||
| if (OS_CYGWIN_NT OR OS_LINUX) | |||
| set(OpenBLAS_utest_src | |||
| @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src | |||
| ) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (NOT NO_LAPACK) | |||
| set(OpenBLAS_utest_src | |||
| @@ -17,13 +17,11 @@ endif | |||
| #this does not work with OpenMP nor with native Windows or Android threads | |||
| # FIXME TBD if this works on OSX, SunOS, POWER and zarch | |||
| ifneq ($(NO_CBLAS), 1) | |||
| ifndef USE_OPENMP | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) | |||
| OBJS += test_fork.o | |||
| endif | |||
| endif | |||
| endif | |||
| all : run_test | |||
| @@ -13,9 +13,9 @@ met: | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| @@ -48,11 +48,13 @@ void* xmalloc(size_t n) | |||
| } | |||
| } | |||
| void check_dgemm(double *a, double *b, double *result, double *expected, int n) | |||
| void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) | |||
| { | |||
| char trans1 = 'T'; | |||
| char trans2 = 'N'; | |||
| double zerod = 0, oned = 1; | |||
| int i; | |||
| cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, | |||
| 1.0, a, n, b, n, 0.0, result, n); | |||
| BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); | |||
| for(i = 0; i < n * n; ++i) { | |||
| ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); | |||
| } | |||
| @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) | |||
| CTEST(fork, safety) | |||
| { | |||
| int n = 1000; | |||
| blasint n = 1000; | |||
| int i; | |||
| double *a, *b, *c, *d; | |||
| @@ -84,8 +86,10 @@ CTEST(fork, safety) | |||
| // Compute a DGEMM product in the parent process prior to forking to | |||
| // ensure that the OpenBLAS thread pool is initialized. | |||
| cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, | |||
| 1.0, a, n, b, n, 0.0, c, n); | |||
| char trans1 = 'T'; | |||
| char trans2 = 'N'; | |||
| double zerod = 0, oned = 1; | |||
| BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); | |||
| fork_pid = fork(); | |||
| if (fork_pid == -1) { | |||