Merge xianyi:develop into developtags/v0.3.1
| @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) | |||||
| RELA = re_lapack | RELA = re_lapack | ||||
| endif | endif | ||||
| ifeq ($(NO_FORTRAN), 1) | |||||
| define NOFORTRAN | |||||
| 1 | |||||
| endef | |||||
| define NO_LAPACK | |||||
| 1 | |||||
| endef | |||||
| export NOFORTRAN | |||||
| export NO_LAPACK | |||||
| endif | |||||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | ||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | ||||
| @@ -47,7 +58,7 @@ endif | |||||
| endif | endif | ||||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | ||||
| ifndef NOFORTRAN | |||||
| # Report the Fortran compiler unless NOFORTRAN is 1 or 2 (an unset or any other value keeps it). | |||||
| # The former filter-out comparison could never be equal, so the guard was always taken. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | ||||
| endif | endif | ||||
| ifneq ($(OSNAME), AIX) | ifneq ($(OSNAME), AIX) | ||||
| @@ -108,7 +119,7 @@ endif | |||||
| endif | endif | ||||
| tests : | tests : | ||||
| ifndef NOFORTRAN | |||||
| # Run the Fortran-driven tests unless NOFORTRAN is 1 or 2 (an unset or any other value keeps them). | |||||
| # The former filter-out comparison could never be equal, so the guard was always taken. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| touch $(LIBNAME) | touch $(LIBNAME) | ||||
| ifndef NO_FBLAS | ifndef NO_FBLAS | ||||
| $(MAKE) -C test all | $(MAKE) -C test all | ||||
| @@ -210,7 +221,7 @@ netlib : | |||||
| else | else | ||||
| netlib : lapack_prebuild | netlib : lapack_prebuild | ||||
| ifndef NOFORTRAN | |||||
| # Build the netlib LAPACK libraries unless NOFORTRAN is 1 or 2 (an unset or any other value keeps them). | |||||
| # The former filter-out comparison could never be equal, so the guard was always taken. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | ||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | ||||
| endif | endif | ||||
| @@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | ||||
| lapack_prebuild : | lapack_prebuild : | ||||
| ifndef NOFORTRAN | |||||
| # Generate make.inc for netlib LAPACK unless NOFORTRAN is 1 or 2 (an unset or any other value keeps it). | |||||
| # The former filter-out comparison could never be equal, so the guard was always taken; | |||||
| # the $(info ...) lines that printed the filter result were debugging output and are dropped. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @@ -274,21 +288,21 @@ endif | |||||
| endif | endif | ||||
| large.tgz : | large.tgz : | ||||
| ifndef NOFORTRAN | |||||
| # Fetch the archive unless NOFORTRAN is 1 or 2. The former filter comparison was also | |||||
| # equal (empty == empty) when NOFORTRAN was unset, which skipped the recipe by default. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| if [ ! -a $< ]; then | if [ ! -a $< ]; then | ||||
| -wget http://www.netlib.org/lapack/timing/large.tgz; | -wget http://www.netlib.org/lapack/timing/large.tgz; | ||||
| fi | fi | ||||
| endif | endif | ||||
| timing.tgz : | timing.tgz : | ||||
| ifndef NOFORTRAN | |||||
| # Fetch the archive unless NOFORTRAN is 1 or 2. The former filter comparison was also | |||||
| # equal (empty == empty) when NOFORTRAN was unset, which skipped the recipe by default. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| if [ ! -a $< ]; then | if [ ! -a $< ]; then | ||||
| -wget http://www.netlib.org/lapack/timing/timing.tgz; | -wget http://www.netlib.org/lapack/timing/timing.tgz; | ||||
| fi | fi | ||||
| endif | endif | ||||
| lapack-timing : large.tgz timing.tgz | lapack-timing : large.tgz timing.tgz | ||||
| ifndef NOFORTRAN | |||||
| # Unpack and build the LAPACK timing suite unless NOFORTRAN is 1 or 2. The former filter | |||||
| # comparison was also equal (empty == empty) when NOFORTRAN was unset, skipping the recipe by default. | |||||
| ifeq ($(filter 1 2,$(NOFORTRAN)),) | |||||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | ||||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | ||||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | ||||
| @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev | |||||
| # This flag is always set for POWER8. Don't modify the flag | # This flag is always set for POWER8. Don't modify the flag | ||||
| # USE_OPENMP = 1 | # USE_OPENMP = 1 | ||||
| # The OpenMP scheduler to use - by default this is "static" and you | |||||
| # will normally not want to change this unless you know that your main | |||||
| # workload will involve tasks that have highly unbalanced running times | |||||
| # for individual threads. Changing away from "static" may also adversely | |||||
| # affect memory access locality in NUMA systems. Setting to "runtime" will | |||||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||||
| # You can define maximum number of threads. Basically it should be | # You can define maximum number of threads. Basically it should be | ||||
| # less than actual number of cores. If you don't specify one, it's | # less than actual number of cores. If you don't specify one, it's | ||||
| # automatically detected by the script. | # automatically detected by the script. | ||||
| @@ -156,8 +164,11 @@ NO_AFFINITY = 1 | |||||
| # CONSISTENT_FPCSR = 1 | # CONSISTENT_FPCSR = 1 | ||||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | ||||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | |||||
| # in small matrix sizes. The default value is 4. | |||||
| # with single thread. (Actually in recent versions this is a factor proportional to the | |||||
| # number of floating point operations necessary for the given problem size, no longer | |||||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | |||||
| # threading in small matrix sizes. The default value is 4, but values as high as 50 have | |||||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | # GEMM_MULTITHREAD_THRESHOLD = 4 | ||||
| # If you need a sanity check by comparing with the reference BLAS. It'll be very | # If you need a sanity check by comparing with the reference BLAS. It'll be very | ||||
| @@ -248,7 +248,7 @@ endif | |||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| ifndef MACOSX_DEPLOYMENT_TARGET | ifndef MACOSX_DEPLOYMENT_TARGET | ||||
| export MACOSX_DEPLOYMENT_TARGET=10.6 | |||||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||||
| endif | endif | ||||
| MD5SUM = md5 -r | MD5SUM = md5 -r | ||||
| endif | endif | ||||
| @@ -8,6 +8,13 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(CORE), SKYLAKEX) | |||||
| ifndef NO_AVX512 | |||||
| CCOMMON_OPT += -march=skylake-avx512 | |||||
| FCOMMON_OPT += -march=skylake-avx512 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| ARFLAGS = -m x64 | ARFLAGS = -m x64 | ||||
| endif | endif | ||||
| @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||||
| CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
| CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | ||||
| @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * - large enough to support all architectures and kernel | * - large enough to support all architectures and kernel | ||||
| * Choosing too small a SIZE will lead to stack smashing. | * Choosing too small a SIZE will lead to stack smashing. | ||||
| */ | */ | ||||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||||
| /* do not restore all register */ \ | |||||
| volatile int stack_alloc_size = SIZE; \ | |||||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ | |||||
| stack_alloc_size = 0; \ | |||||
| STACK_ALLOC_PROTECT_SET \ | |||||
| TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ | |||||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||||
| /* do not restore all register */ \ | |||||
| volatile int stack_alloc_size = SIZE; \ | |||||
| if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ | |||||
| STACK_ALLOC_PROTECT_SET \ | |||||
| /* Avoid declaring an array of length 0 */ \ | |||||
| TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ | |||||
| __attribute__((aligned(0x20))); \ | |||||
| BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); | BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); | ||||
| #else | #else | ||||
| //Original OpenBLAS/GotoBLAS codes. | //Original OpenBLAS/GotoBLAS codes. | ||||
| @@ -60,8 +60,13 @@ | |||||
| #endif | #endif | ||||
| */ | */ | ||||
| #define MB | |||||
| #define WMB | |||||
| #ifdef __GNUC__ | |||||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||||
| #else | |||||
| #define MB do {} while (0) | |||||
| #define WMB do {} while (0) | |||||
| #endif | |||||
| static void __inline blas_lock(volatile BLASULONG *address){ | static void __inline blas_lock(volatile BLASULONG *address){ | ||||
| @@ -1339,6 +1339,23 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | break; | ||||
| case 6: | |||||
| switch (model) { | |||||
| case 6: // Cannon Lake | |||||
| #ifndef NO_AVX512 | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| #else | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CPUTYPE_HASWELL; | |||||
| #else | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| #endif | |||||
| } | |||||
| break; | |||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| switch (model) { | switch (model) { | ||||
| @@ -102,7 +102,13 @@ clean :: | |||||
| rm -f x* | rm -f x* | ||||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | ||||
| CEXTRALIB = | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| CEXTRALIB = -lomp | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| # Single real | # Single real | ||||
| xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | ||||
| @@ -91,11 +91,7 @@ | |||||
| #endif | #endif | ||||
| typedef struct { | typedef struct { | ||||
| #if __STDC_VERSION__ >= 201112L | |||||
| _Atomic | |||||
| #else | |||||
| volatile | volatile | ||||
| #endif | |||||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | ||||
| } job_t; | } job_t; | ||||
| @@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | ||||
| for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | ||||
| /* Make sure if no one is using workspace */ | |||||
| START_RPCC(); | |||||
| for (i = 0; i < args -> nthreads; i++) | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||||
| STOP_RPCC(waiting1); | |||||
| #if defined(FUSED_GEMM) && !defined(TIMING) | #if defined(FUSED_GEMM) && !defined(TIMING) | ||||
| /* Fused operation to copy region of B into workspace and apply kernel */ | /* Fused operation to copy region of B into workspace and apply kernel */ | ||||
| @@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| } | } | ||||
| #endif | #endif | ||||
| /* Set flag so other threads can access local region of B */ | |||||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | |||||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { | |||||
| /* Make sure if no one is using workspace */ | |||||
| START_RPCC(); | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||||
| STOP_RPCC(waiting1); | |||||
| /* Set flag so other threads can access local region of B */ | |||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | ||||
| WMB; | |||||
| WMB; | |||||
| } | |||||
| } | } | ||||
| /* Get regions of B from other threads and apply kernel */ | /* Get regions of B from other threads and apply kernel */ | ||||
| @@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Wait until other region of B is initialized */ | /* Wait until other region of B is initialized */ | ||||
| START_RPCC(); | START_RPCC(); | ||||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||||
| STOP_RPCC(waiting2); | STOP_RPCC(waiting2); | ||||
| /* Apply kernel with local region of A and part of other region of B */ | /* Apply kernel with local region of A and part of other region of B */ | ||||
| @@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Clear synchronization flag if this thread is done with other region of B */ | /* Clear synchronization flag if this thread is done with other region of B */ | ||||
| if (m_to - m_from == min_i) { | if (m_to - m_from == min_i) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||||
| WMB; | |||||
| } | } | ||||
| } | } | ||||
| } while (current != mypos); | } while (current != mypos); | ||||
| /* Iterate through steps of m | |||||
| /* Iterate through steps of m | |||||
| * Note: First step has already been finished */ | * Note: First step has already been finished */ | ||||
| for(is = m_from + min_i; is < m_to; is += min_i){ | for(is = m_from + min_i; is < m_to; is += min_i){ | ||||
| min_i = m_to - is; | min_i = m_to - is; | ||||
| @@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | ||||
| c, ldc, is, js); | c, ldc, is, js); | ||||
| STOP_RPCC(kernel); | STOP_RPCC(kernel); | ||||
| #ifdef TIMING | #ifdef TIMING | ||||
| ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | ||||
| #endif | #endif | ||||
| /* Clear synchronization flag if this thread is done with region of B */ | /* Clear synchronization flag if this thread is done with region of B */ | ||||
| if (is + min_i >= m_to) { | if (is + min_i >= m_to) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| } | } | ||||
| @@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| START_RPCC(); | START_RPCC(); | ||||
| for (i = 0; i < args -> nthreads; i++) { | for (i = 0; i < args -> nthreads; i++) { | ||||
| for (js = 0; js < DIVIDE_RATE; js++) { | for (js = 0; js < DIVIDE_RATE; js++) { | ||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; | |||||
| } | } | ||||
| } | } | ||||
| STOP_RPCC(waiting3); | STOP_RPCC(waiting3); | ||||
| @@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| } | } | ||||
| /* Clear synchronization flags */ | /* Clear synchronization flags */ | ||||
| for (i = 0; i < MAX_CPU_NUMBER; i++) { | |||||
| for (j = 0; j < MAX_CPU_NUMBER; j++) { | |||||
| for (i = 0; i < nthreads; i++) { | |||||
| for (j = 0; j < nthreads; j++) { | |||||
| for (k = 0; k < DIVIDE_RATE; k++) { | for (k = 0; k < DIVIDE_RATE; k++) { | ||||
| job[i].working[j][CACHE_LINE_SIZE * k] = 0; | job[i].working[j][CACHE_LINE_SIZE * k] = 0; | ||||
| } | } | ||||
| @@ -48,6 +48,10 @@ | |||||
| #else | #else | ||||
| #ifndef OMP_SCHED | |||||
| #define OMP_SCHED static | |||||
| #endif | |||||
| int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | ||||
| @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| break; | break; | ||||
| } | } | ||||
| #pragma omp parallel for schedule(static) | |||||
| #pragma omp parallel for schedule(OMP_SCHED) | |||||
| for (i = 0; i < num; i ++) { | for (i = 0; i < num; i ++) { | ||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | #ifndef USE_SIMPLE_THREADED_LEVEL3 | ||||
| @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ | |||||
| return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
| } | } | ||||
| return NULL; | return NULL; | ||||
| case 6: | |||||
| if (model == 6) { | |||||
| // Cannon Lake: use the SKYLAKEX kernel table unless AVX512 is compiled out | |||||
| #ifndef NO_AVX512 | |||||
| return &gotoblas_SKYLAKEX; | |||||
| #else | |||||
| // NO_AVX512 build: choose by support_avx() and NO_AVX2 instead | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return &gotoblas_HASWELL; | |||||
| #else | |||||
| // was misspelt "gotblas_SANDYBRIDGE", an identifier that matches no gotoblas_* table | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return &gotoblas_NEHALEM; | |||||
| #endif | |||||
| } | |||||
| return NULL; | |||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| if (model == 14 ) { // Kaby Lake | if (model == 14 ) { // Kaby Lake | ||||
| @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ | |||||
| idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) | idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) | ||||
| CSBLAS1OBJS = \ | CSBLAS1OBJS = \ | ||||
| cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||||
| cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | ||||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | ||||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | ||||
| @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ | |||||
| cblas_sgeadd.$(SUFFIX) | cblas_sgeadd.$(SUFFIX) | ||||
| CDBLAS1OBJS = \ | CDBLAS1OBJS = \ | ||||
| cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | ||||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | ||||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | ||||
| @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ | |||||
| cblas_dgeadd.$(SUFFIX) | cblas_dgeadd.$(SUFFIX) | ||||
| CCBLAS1OBJS = \ | CCBLAS1OBJS = \ | ||||
| cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||||
| cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||||
| cblas_ccopy.$(SUFFIX) \ | cblas_ccopy.$(SUFFIX) \ | ||||
| cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ | cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ | ||||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | ||||
| @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ | |||||
| CZBLAS1OBJS = \ | CZBLAS1OBJS = \ | ||||
| cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||||
| cblas_zcopy.$(SUFFIX) \ | cblas_zcopy.$(SUFFIX) \ | ||||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | ||||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | ||||
| @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c | |||||
| cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c | cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | ||||
| cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c | cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | ||||
| @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| #define SWITCH_RATIO 4 | |||||
| #define SWITCH_RATIO 32 | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| #define SWITCH_RATIO 4 | |||||
| #define SWITCH_RATIO 32 | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| @@ -122,8 +122,13 @@ endif | |||||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | ||||
| CEXTRALIB = | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| CEXTRALIB = -lomp | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) | sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) | ||||
| $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | ||||
| @@ -25,7 +25,6 @@ endif () | |||||
| # known to hang with the native Windows and Android threads | # known to hang with the native Windows and Android threads | ||||
| # FIXME needs checking if this works on any of the other platforms | # FIXME needs checking if this works on any of the other platforms | ||||
| if (NOT NO_CBLAS) | |||||
| if (NOT USE_OPENMP) | if (NOT USE_OPENMP) | ||||
| if (OS_CYGWIN_NT OR OS_LINUX) | if (OS_CYGWIN_NT OR OS_LINUX) | ||||
| set(OpenBLAS_utest_src | set(OpenBLAS_utest_src | ||||
| @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src | |||||
| ) | ) | ||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| endif() | |||||
| if (NOT NO_LAPACK) | if (NOT NO_LAPACK) | ||||
| set(OpenBLAS_utest_src | set(OpenBLAS_utest_src | ||||
| @@ -17,13 +17,11 @@ endif | |||||
| #this does not work with OpenMP nor with native Windows or Android threads | #this does not work with OpenMP nor with native Windows or Android threads | ||||
| # FIXME TBD if this works on OSX, SunOS, POWER and zarch | # FIXME TBD if this works on OSX, SunOS, POWER and zarch | ||||
| ifneq ($(NO_CBLAS), 1) | |||||
| ifndef USE_OPENMP | ifndef USE_OPENMP | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) | ||||
| OBJS += test_fork.o | OBJS += test_fork.o | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| all : run_test | all : run_test | ||||
| @@ -13,9 +13,9 @@ met: | |||||
| notice, this list of conditions and the following disclaimer in | notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | the documentation and/or other materials provided with the | ||||
| distribution. | distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| @@ -48,11 +48,13 @@ void* xmalloc(size_t n) | |||||
| } | } | ||||
| } | } | ||||
| void check_dgemm(double *a, double *b, double *result, double *expected, int n) | |||||
| void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) | |||||
| { | { | ||||
| char trans1 = 'T'; | |||||
| char trans2 = 'N'; | |||||
| double zerod = 0, oned = 1; | |||||
| int i; | int i; | ||||
| cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, | |||||
| 1.0, a, n, b, n, 0.0, result, n); | |||||
| BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); | |||||
| for(i = 0; i < n * n; ++i) { | for(i = 0; i < n * n; ++i) { | ||||
| ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); | ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); | ||||
| } | } | ||||
| @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) | |||||
| CTEST(fork, safety) | CTEST(fork, safety) | ||||
| { | { | ||||
| int n = 1000; | |||||
| blasint n = 1000; | |||||
| int i; | int i; | ||||
| double *a, *b, *c, *d; | double *a, *b, *c, *d; | ||||
| @@ -84,8 +86,10 @@ CTEST(fork, safety) | |||||
| // Compute a DGEMM product in the parent process prior to forking to | // Compute a DGEMM product in the parent process prior to forking to | ||||
| // ensure that the OpenBLAS thread pool is initialized. | // ensure that the OpenBLAS thread pool is initialized. | ||||
| cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, | |||||
| 1.0, a, n, b, n, 0.0, c, n); | |||||
| char trans1 = 'T'; | |||||
| char trans2 = 'N'; | |||||
| double zerod = 0, oned = 1; | |||||
| BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); | |||||
| fork_pid = fork(); | fork_pid = fork(); | ||||
| if (fork_pid == -1) { | if (fork_pid == -1) { | ||||