Update the 0.3 branch from developtags/v0.3.1
| @@ -6,12 +6,15 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 0.dev) | |||
| set(OpenBLAS_PATCH_VERSION 1.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| include(GNUInstallDirs) | |||
| include(CMakePackageConfigHelpers) | |||
| set(OpenBLAS_LIBNAME openblas) | |||
| ####### | |||
| @@ -20,6 +23,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON | |||
| endif() | |||
| option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) | |||
| option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) | |||
| option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) | |||
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| @@ -208,6 +212,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| # Install libraries | |||
| install(TARGETS ${OpenBLAS_LIBNAME} | |||
| EXPORT "OpenBLASTargets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| @@ -267,3 +272,21 @@ if(PKG_CONFIG_FOUND) | |||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| endif() | |||
| # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". | |||
| set(PN OpenBLAS) | |||
| set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") | |||
| configure_package_config_file(cmake/${PN}Config.cmake.in | |||
| "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" | |||
| INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| VERSION ${${PN}_VERSION} | |||
| COMPATIBILITY AnyNewerVersion) | |||
| install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake | |||
| ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| install(EXPORT "${PN}Targets" | |||
| NAMESPACE "${PN}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | |||
| done | |||
| @echo DYNAMIC_ARCH=1 >> Makefile.conf_last | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| @echo DYNAMIC_OLDER=1 >> Makefile.conf_last | |||
| endif | |||
| endif | |||
| ifdef USE_THREAD | |||
| @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last | |||
| @@ -294,9 +297,10 @@ endif | |||
| lapack-test : | |||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| ifneq ($(CROSS), 1) | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| endif | |||
| @@ -308,9 +312,9 @@ lapack-runtest: | |||
| blas-test: | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) | |||
| dummy : | |||
| @@ -98,7 +98,7 @@ endif | |||
| @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.0.dev | |||
| VERSION = 0.3.1.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -17,6 +17,11 @@ VERSION = 0.3.0.dev | |||
| # If you want to support multiple architecture in one binary | |||
| # DYNAMIC_ARCH = 1 | |||
| # If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH | |||
| # mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON, | |||
| # OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures) | |||
| # DYNAMIC_OLDER = 1 | |||
| # C compiler including binary type(32bit / 64bit). Default is gcc. | |||
| # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. | |||
| # CC = gcc | |||
| @@ -55,6 +60,14 @@ VERSION = 0.3.0.dev | |||
| # This flag is always set for POWER8. Don't modify the flag | |||
| # USE_OPENMP = 1 | |||
| # The OpenMP scheduler to use - by default this is "static" and you | |||
| # will normally not want to change this unless you know that your main | |||
| # workload will involve tasks that have highly unbalanced running times | |||
| # for individual threads. Changing away from "static" may also adversely | |||
| # affect memory access locality in NUMA systems. Setting to "runtime" will | |||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
| # You can define maximum number of threads. Basically it should be | |||
| # less than actual number of cores. If you don't specify one, it's | |||
| # automatically detected by the the script. | |||
| @@ -151,8 +164,11 @@ NO_AFFINITY = 1 | |||
| # CONSISTENT_FPCSR = 1 | |||
| # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute | |||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | |||
| # in small matrix sizes. The default value is 4. | |||
| # with single thread. (Actually in recent versions this is a factor proportional to the | |||
| # number of floating point operations necessary for the given problem size, no longer | |||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | |||
| # threading in small matrix sizes. The default value is 4, but values as high as 50 have | |||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # If you need santy check by comparing reference BLAS. It'll be very | |||
| @@ -62,6 +62,9 @@ ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -95,6 +98,9 @@ ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET_CORE), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -141,6 +147,10 @@ ifeq ($(NO_AVX2), 1) | |||
| GETARCH_FLAGS += -DNO_AVX2 | |||
| endif | |||
| ifeq ($(NO_AVX512), 1) | |||
| GETARCH_FLAGS += -DNO_AVX512 | |||
| endif | |||
| ifeq ($(DEBUG), 1) | |||
| GETARCH_FLAGS += -g | |||
| endif | |||
| @@ -238,7 +248,7 @@ endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| ifndef MACOSX_DEPLOYMENT_TARGET | |||
| export MACOSX_DEPLOYMENT_TARGET=10.6 | |||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
| endif | |||
| MD5SUM = md5 -r | |||
| endif | |||
| @@ -462,13 +472,37 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| DYNAMIC_CORE = PRESCOTT CORE2 | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += PENRYN DUNNINGTON | |||
| endif | |||
| DYNAMIC_CORE += NEHALEM | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += OPTERON OPTERON_SSE3 | |||
| endif | |||
| DYNAMIC_CORE += BARCELONA | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += BOBCAT ATOM NANO | |||
| endif | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR | |||
| endif | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += HASWELL ZEN | |||
| endif | |||
| ifneq ($(NO_AVX512), 1) | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += SKYLAKEX | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef DYNAMIC_LIST | |||
| override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) | |||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT | |||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||
| CCOMMON_OPT += $(XCCOMMON_OPT) | |||
| #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| @@ -902,6 +936,10 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| CCOMMON_OPT += -DDYNAMIC_ARCH | |||
| endif | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| CCOMMON_OPT += -DDYNAMIC_OLDER | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| CCOMMON_OPT += -DNO_LAPACK | |||
| #Disable LAPACK C interface | |||
| @@ -924,6 +962,10 @@ ifeq ($(NO_AVX2), 1) | |||
| CCOMMON_OPT += -DNO_AVX2 | |||
| endif | |||
| ifeq ($(NO_AVX512), 1) | |||
| CCOMMON_OPT += -DNO_AVX512 | |||
| endif | |||
| ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| @@ -1230,6 +1272,7 @@ export MSA_FLAGS | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export SGEMM_UNROLL_M | |||
| export SGEMM_UNROLL_N | |||
| @@ -8,6 +8,13 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| ARFLAGS = -m x64 | |||
| endif | |||
| @@ -20,6 +20,7 @@ DUNNINGTON | |||
| NEHALEM | |||
| SANDYBRIDGE | |||
| HASWELL | |||
| SKYLAKEX | |||
| ATOM | |||
| b)AMD CPU: | |||
| @@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $binformat = bin32; | |||
| $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| $no_avx512= 0; | |||
| if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "int main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| $no_avx512 = 0; | |||
| } | |||
| unlink("tmpf.o"); | |||
| } | |||
| $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| $data =~ /globl\s([_\.]*)(.*)/; | |||
| @@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0; | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; | |||
| print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -0,0 +1,79 @@ | |||
| # OpenBLASConfig.cmake | |||
| # -------------------- | |||
| # | |||
| # OpenBLAS cmake module. | |||
| # This module sets the following variables in your project:: | |||
| # | |||
| # OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system | |||
| # OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release | |||
| # OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located. | |||
| # OpenBLAS_INCLUDE_DIR - same as DIRS | |||
| # OpenBLAS_LIBRARIES - OpenBLAS library to link against. | |||
| # OpenBLAS_LIBRARY - same as LIBRARIES | |||
| # | |||
| # | |||
| # Available components:: | |||
| # | |||
| ## shared - search for only shared library | |||
| ## static - search for only static library | |||
| # serial - search for unthreaded library | |||
| # pthread - search for native pthread threaded library | |||
| # openmp - search for OpenMP threaded library | |||
| # | |||
| # | |||
| # Exported targets:: | |||
| # | |||
| # If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED` | |||
| ## target. Target is shared _or_ static, so, for both, use separate, not | |||
| ## overlapping, installations. :: | |||
| # | |||
| # OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached. | |||
| # | |||
| # | |||
| # Suggested usage:: | |||
| # | |||
| # find_package(OpenBLAS) | |||
| # find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread) | |||
| # | |||
| # | |||
| # The following variables can be set to guide the search for this package:: | |||
| # | |||
| # OpenBLAS_DIR - CMake variable, set to directory containing this Config file | |||
| # CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package | |||
| # PATH - environment variable, set to bin directory of this package | |||
| # CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables | |||
| # find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build | |||
| @PACKAGE_INIT@ | |||
| set(PN OpenBLAS) | |||
| # need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon | |||
| if(@USE_OPENMP@) | |||
| set(${PN}_openmp_FOUND 1) | |||
| elseif(@USE_THREAD@) | |||
| set(${PN}_pthread_FOUND 1) | |||
| else() | |||
| set(${PN}_serial_FOUND 1) | |||
| endif() | |||
| check_required_components(${PN}) | |||
| #----------------------------------------------------------------------------- | |||
| # Don't include targets if this file is being picked up by another | |||
| # project which has already built this as a subproject | |||
| #----------------------------------------------------------------------------- | |||
| if(NOT TARGET ${PN}::OpenBLAS) | |||
| include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") | |||
| get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) | |||
| set(${PN}_LIBRARY ${_loc}) | |||
| get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) | |||
| set(${PN}_LIBRARIES ${_ill}) | |||
| get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) | |||
| set(${PN}_INCLUDE_DIR ${_id}) | |||
| get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) | |||
| set(${PN}_INCLUDE_DIRS ${_iid}) | |||
| endif() | |||
| @@ -49,13 +49,27 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| if (X86_64) | |||
| set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | |||
| set(DYNAMIC_CORE PRESCOTT CORE2) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON) | |||
| endif () | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3) | |||
| endif () | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO) | |||
| endif () | |||
| if (NOT NO_AVX) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) | |||
| endif () | |||
| if (NOT NO_AVX2) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | |||
| endif () | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
| endif () | |||
| endif () | |||
| if (NOT DYNAMIC_CORE) | |||
| @@ -1,7 +1,7 @@ | |||
| libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||
| openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OPENBLAS_VERSION@ | |||
| @@ -33,7 +33,7 @@ endif () | |||
| if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| set(NO_AVX 1) | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") | |||
| set(TARGET "NEHALEM") | |||
| endif () | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| @@ -163,6 +163,9 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| endif () | |||
| endif () | |||
| if (NO_LAPACK) | |||
| @@ -66,3 +66,12 @@ else() | |||
| set(BINARY32 1) | |||
| endif() | |||
| if (X86_64 OR X86) | |||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) | |||
| if (NO_AVX512 EQUAL 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| file(REMOVE "avx512.tmp" "avx512.o") | |||
| endif() | |||
| @@ -642,6 +642,7 @@ void gotoblas_profile_init(void); | |||
| void gotoblas_profile_quit(void); | |||
| #ifdef USE_OPENMP | |||
| #ifndef C_MSVC | |||
| int omp_in_parallel(void); | |||
| int omp_get_num_procs(void); | |||
| @@ -649,12 +650,21 @@ int omp_get_num_procs(void); | |||
| __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||
| #endif | |||
| #if (__STDC_VERSION__ >= 201112L) | |||
| #if defined(C_GCC) && ( __GNUC__ < 7) | |||
| // workaround for GCC bug 65467 | |||
| #ifndef _Atomic | |||
| #define _Atomic volatile | |||
| #endif | |||
| #endif | |||
| #include <stdatomic.h> | |||
| #else | |||
| #ifndef _Atomic | |||
| #define _Atomic volatile | |||
| #endif | |||
| #endif | |||
| #else | |||
| #ifdef __ELF__ | |||
| int omp_in_parallel (void) __attribute__ ((weak)); | |||
| @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * - large enough to support all architectures and kernel | |||
| * Chosing a too small SIZE will lead to a stack smashing. | |||
| */ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| /* do not restore all register */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ | |||
| stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| /* do not restore all register */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| /* Avoid declaring an array of length 0 */ \ | |||
| TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ | |||
| __attribute__((aligned(0x20))); \ | |||
| BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); | |||
| #else | |||
| //Original OpenBLAS/GotoBLAS codes. | |||
| @@ -60,8 +60,13 @@ | |||
| #endif | |||
| */ | |||
| #define MB | |||
| #define WMB | |||
| #ifdef __GNUC__ | |||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #else | |||
| #define MB do {} while (0) | |||
| #define WMB do {} while (0) | |||
| #endif | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| @@ -115,6 +115,7 @@ | |||
| #define CORE_STEAMROLLER 25 | |||
| #define CORE_EXCAVATOR 26 | |||
| #define CORE_ZEN 27 | |||
| #define CORE_SKYLAKEX 28 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -137,6 +138,7 @@ | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define HAVE_FMA3 (1 << 20) | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -211,5 +213,6 @@ typedef struct { | |||
| #define CPUTYPE_STEAMROLLER 49 | |||
| #define CPUTYPE_EXCAVATOR 50 | |||
| #define CPUTYPE_ZEN 51 | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #endif | |||
| @@ -50,6 +50,8 @@ | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | |||
| #define CORE_HASWELL CORE_NEHALEM | |||
| #define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM | |||
| #define CORE_SKYLAKEX CORE_NEHALEM | |||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
| @@ -1299,6 +1301,19 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| #ifndef NO_AVX512 | |||
| return CPUTYPE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| #endif | |||
| case 14: | |||
| // Skylake | |||
| if(support_avx()) | |||
| @@ -1324,6 +1339,23 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| switch (model) { | |||
| case 6: // Cannon Lake | |||
| #ifndef NO_AVX512 | |||
| return CPUTYPE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| #endif | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| @@ -1556,6 +1588,7 @@ static char *cpuname[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX" | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1610,6 +1643,7 @@ static char *lowercpuname[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex" | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1641,6 +1675,7 @@ static char *corename[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX" | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1672,6 +1707,7 @@ static char *corename_lower[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex" | |||
| }; | |||
| @@ -1860,6 +1896,19 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| #ifndef NO_AVX512 | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| case 14: | |||
| // Skylake | |||
| if(support_avx()) | |||
| @@ -102,7 +102,13 @@ clean :: | |||
| rm -f x* | |||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| CEXTRALIB = | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| # Single real | |||
| xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | |||
| @@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -91,11 +91,7 @@ | |||
| #endif | |||
| typedef struct { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | |||
| for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | |||
| /* Make sure if no one is using workspace */ | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| STOP_RPCC(waiting1); | |||
| #if defined(FUSED_GEMM) && !defined(TIMING) | |||
| /* Fused operation to copy region of B into workspace and apply kernel */ | |||
| @@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| } | |||
| #endif | |||
| /* Set flag so other threads can access local region of B */ | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { | |||
| /* Make sure if no one is using workspace */ | |||
| START_RPCC(); | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| /* Set flag so other threads can access local region of B */ | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| WMB; | |||
| WMB; | |||
| } | |||
| } | |||
| /* Get regions of B from other threads and apply kernel */ | |||
| @@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Wait until other region of B is initialized */ | |||
| START_RPCC(); | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| /* Apply kernel with local region of A and part of other region of B */ | |||
| @@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Clear synchronization flag if this thread is done with other region of B */ | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| /* Iterate through steps of m | |||
| /* Iterate through steps of m | |||
| * Note: First step has already been finished */ | |||
| for(is = m_from + min_i; is < m_to; is += min_i){ | |||
| min_i = m_to - is; | |||
| @@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, ldc, is, js); | |||
| STOP_RPCC(kernel); | |||
| #ifdef TIMING | |||
| ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | |||
| #endif | |||
| /* Clear synchronization flag if this thread is done with region of B */ | |||
| if (is + min_i >= m_to) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| @@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| for (js = 0; js < DIVIDE_RATE; js++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; | |||
| } | |||
| } | |||
| STOP_RPCC(waiting3); | |||
| @@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| } | |||
| /* Clear synchronization flags */ | |||
| for (i = 0; i < MAX_CPU_NUMBER; i++) { | |||
| for (j = 0; j < MAX_CPU_NUMBER; j++) { | |||
| for (i = 0; i < nthreads; i++) { | |||
| for (j = 0; j < nthreads; j++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| job[i].working[j][CACHE_LINE_SIZE * k] = 0; | |||
| } | |||
| @@ -48,6 +48,10 @@ | |||
| #else | |||
| #ifndef OMP_SCHED | |||
| #define OMP_SCHED static | |||
| #endif | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| break; | |||
| } | |||
| #pragma omp parallel for schedule(static) | |||
| #pragma omp parallel for schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| @@ -49,6 +49,167 @@ | |||
| #define EXTERN | |||
| #endif | |||
| #ifdef DYNAMIC_LIST | |||
| extern gotoblas_t gotoblas_PRESCOTT; | |||
| #ifdef DYN_ATHLON | |||
| extern gotoblas_t gotoblas_ATHLON; | |||
| #else | |||
| #define gotoblas_ATHLON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_KATMAI | |||
| extern gotoblas_t gotoblas_KATMAI; | |||
| #else | |||
| #define gotoblas_KATMAI gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BANIAS | |||
| extern gotoblas_t gotoblas_BANIAS; | |||
| #else | |||
| #define gotoblas_BANIAS gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_COPPERMINE | |||
| extern gotoblas_t gotoblas_COPPERMINE; | |||
| #else | |||
| #define gotoblas_COPPERMINE gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NORTHWOOD | |||
| extern gotoblas_t gotoblas_NORTHWOOD; | |||
| #else | |||
| #define gotoblas_NORTHWOOD gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_CORE2 | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| #else | |||
| #define gotoblas_CORE2 gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NEHALEM | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| #else | |||
| #define gotoblas_NEHALEM gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BARCELONA | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BARCELONA gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BARCELONA gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_ATOM | |||
| extern gotoblas_t gotoblas_ATOM; | |||
| elif defined(DYN_NEHALEM) | |||
| #define gotoblas_ATOM gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_ATOM gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NANO | |||
| extern gotoblas_t gotoblas_NANO; | |||
| #else | |||
| #define gotoblas_NANO gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_PENRYN | |||
| extern gotoblas_t gotoblas_PENRYN; | |||
| #else | |||
| #define gotoblas_PENRYN gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_DUNNINGTON | |||
| extern gotoblas_t gotoblas_DUNNINGTON; | |||
| #else | |||
| #define gotoblas_DUNNINGTON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_OPTERON | |||
| extern gotoblas_t gotoblas_OPTERON; | |||
| #else | |||
| #define gotoblas_OPTERON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_OPTERON_SSE3 | |||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | |||
| #else | |||
| #define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BOBCAT | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BOBCAT gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BOBCAT gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_SANDYBRIDGE | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BULLDOZER | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BULLDOZER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BULLDOZER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_PILEDRIVER | |||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_PILEDRIVER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_PILEDRIVER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_STEAMROLLER | |||
| extern gotoblas_t gotoblas_STEAMROLLER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_STEAMROLLER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_STEAMROLLER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_EXCAVATOR | |||
| extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_EXCAVATOR gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_EXCAVATOR gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_HASWELL | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_HASWELL gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_ZEN | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| #elif defined(DYN_HASWELL) | |||
| #define gotoblas_ZEN gotoblas_HASWELL | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_ZEN gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_ZEN gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_SKYLAKEX | |||
| extern gotoblas_t gotoblas_SKYLAKEX; | |||
| #elif defined(DYN_HASWELL) | |||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_SKYLAKEX gotoblas_PRESCOTT | |||
| #endif | |||
| #else // not DYNAMIC_LIST | |||
| EXTERN gotoblas_t gotoblas_KATMAI; | |||
| EXTERN gotoblas_t gotoblas_COPPERMINE; | |||
| EXTERN gotoblas_t gotoblas_NORTHWOOD; | |||
| @@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS; | |||
| EXTERN gotoblas_t gotoblas_ATHLON; | |||
| extern gotoblas_t gotoblas_PRESCOTT; | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| #ifdef DYNAMIC_OLDER | |||
| extern gotoblas_t gotoblas_ATOM; | |||
| extern gotoblas_t gotoblas_NANO; | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| extern gotoblas_t gotoblas_PENRYN; | |||
| extern gotoblas_t gotoblas_DUNNINGTON; | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| extern gotoblas_t gotoblas_OPTERON; | |||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #else | |||
| #define gotoblas_ATOM gotoblas_NEHALEM | |||
| #define gotoblas_NANO gotoblas_NEHALEM | |||
| #define gotoblas_PENRYN gotoblas_CORE2 | |||
| #define gotoblas_DUNNINGTON gotoblas_CORE2 | |||
| #define gotoblas_OPTERON gotoblas_CORE2 | |||
| #define gotoblas_OPTERON_SSE3 gotoblas_CORE2 | |||
| #define gotoblas_BOBCAT gotoblas_CORE2 | |||
| #endif | |||
| #ifndef NO_AVX | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| @@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER; | |||
| extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #ifdef NO_AVX2 | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #else | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| #ifndef NO_AVX512 | |||
| extern gotoblas_t gotoblas_SKYLAKEX; | |||
| #else | |||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | |||
| #endif | |||
| #endif | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | |||
| @@ -90,6 +269,7 @@ extern gotoblas_t gotoblas_ZEN; | |||
| #define gotoblas_ZEN gotoblas_BARCELONA | |||
| #endif | |||
| #endif // DYNAMIC_LIST | |||
| #define VENDOR_INTEL 1 | |||
| #define VENDOR_AMD 2 | |||
| @@ -284,8 +464,21 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| if (model == 5) { | |||
| // Intel Skylake X | |||
| #ifndef NO_AVX512 | |||
| return &gotoblas_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| return &gotoblas_HASWELL; | |||
| else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| #endif | |||
| } | |||
| //Intel Skylake | |||
| if (model == 14 || model == 5) { | |||
| if (model == 14) { | |||
| if(support_avx()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| @@ -307,6 +500,23 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| case 6: | |||
| if (model == 6) { | |||
| // Cannon Lake | |||
| #ifndef NO_AVX512 | |||
| return &gotoblas_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return &gotoblas_HASWELL; | |||
| #else | |||
| return &gotblas_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return &gotoblas_NEHALEM; | |||
| #endif | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 14 ) { // Kaby Lake | |||
| @@ -445,7 +655,8 @@ static char *corename[] = { | |||
| "Haswell", | |||
| "Steamroller", | |||
| "Excavator", | |||
| "Zen" | |||
| "Zen", | |||
| "SkylakeX" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -473,7 +684,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; | |||
| if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; | |||
| if (gotoblas == &gotoblas_ZEN) return corename[23]; | |||
| if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; | |||
| return corename[0]; | |||
| } | |||
| @@ -485,7 +696,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| char message[128]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 23; i++) | |||
| for ( i=1 ; i <= 24; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -503,6 +714,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| switch (found) | |||
| { | |||
| case 24: return (&gotoblas_SKYLAKEX); | |||
| case 23: return (&gotoblas_ZEN); | |||
| case 22: return (&gotoblas_EXCAVATOR); | |||
| case 21: return (&gotoblas_STEAMROLLER); | |||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -128,6 +128,8 @@ so : ../$(LIBSONAME) | |||
| ifeq ($(OSNAME), Android) | |||
| INTERNALNAME = $(LIBPREFIX).so | |||
| FEXTRALIB += -lm | |||
| EXTRALIB += -lm | |||
| else | |||
| INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| #ifdef FORCE_SKYLAKEX | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "SKYLAKEX" | |||
| #define ARCHCONFIG "-DSKYLAKEX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" | |||
| #define LIBNAME "skylakex" | |||
| #define CORENAME "SKYLAKEX" | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){ | |||
| #elif NO_PARALLEL_MAKE==1 | |||
| printf("MAKE += -j 1\n"); | |||
| #else | |||
| #ifndef OS_WINDOWS | |||
| printf("MAKE += -j %d\n", get_num_cores()); | |||
| #endif | |||
| #endif | |||
| break; | |||
| @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ | |||
| idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) | |||
| CSBLAS1OBJS = \ | |||
| cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||
| @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||
| @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ | |||
| cblas_dgeadd.$(SUFFIX) | |||
| CCBLAS1OBJS = \ | |||
| cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_ccopy.$(SUFFIX) \ | |||
| cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c | |||
| cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| @@ -40,11 +40,11 @@ | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #endif | |||
| #if defined(Z13) | |||
| #define MULTI_THREAD_MINIMAL 200000 | |||
| #else | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #endif | |||
| #ifndef CBLAS | |||
| @@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| if (incy < 0) y -= (n - 1) * incy; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| nthreads = 1; | |||
| // | |||
| //Temporarily work-around the low performance issue with small imput size & | |||
| //multithreads. | |||
| if (n <= MULTI_THREAD_MINIMAL) | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -44,6 +44,7 @@ | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMM " | |||
| #elif defined(DOUBLE) | |||
| @@ -52,6 +53,7 @@ | |||
| #define ERROR_NAME "SGEMM " | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| #ifndef GEMM3M | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMM " | |||
| @@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB, | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| XFLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| mode |= (transa << BLAS_TRANSA_SHIFT); | |||
| mode |= (transb << BLAS_TRANSB_SHIFT); | |||
| nthreads_max = num_cpu_avail(3); | |||
| nthreads_avail = nthreads_max; | |||
| #ifndef COMPLEX | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| #else | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| #endif | |||
| args.common = NULL; | |||
| if ( nthreads_max > nthreads_avail ) | |||
| args.nthreads = nthreads_avail; | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = nthreads_max; | |||
| args.nthreads = num_cpu_avail(3); | |||
| args.common = NULL; | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| if (n <= 1048576 ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order, | |||
| mode |= (trans << BLAS_TRANSA_SHIFT); | |||
| mode |= (side << BLAS_RSIDE_SHIFT); | |||
| args.nthreads = num_cpu_avail(3); | |||
| if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(3); | |||
| if (args.nthreads == 1) { | |||
| @@ -41,7 +41,11 @@ | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(Z13) | |||
| #define MULTI_THREAD_MINIMAL 200000 | |||
| #else | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||
| @@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| #endif | |||
| #ifndef CBLAS | |||
| PRINT_DEBUG_CNAME; | |||
| PRINT_DEBUG_NAME; | |||
| #else | |||
| PRINT_DEBUG_CNAME; | |||
| #endif | |||
| @@ -86,12 +90,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| // | |||
| //Temporarily work-around the low performance issue with small imput size & | |||
| //multithreads. | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| FUNCTION_PROFILE_START(); | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| if ( n <= 1048576 ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") | |||
| if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") | |||
| set(USE_TRMM true) | |||
| endif () | |||
| @@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), ZEN) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble cdot_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq cdot_kernel_L999 | |||
| # cmp INC_X, #0 | |||
| # beq cdot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq cdot_kernel_L999 | |||
| # cmp INC_Y, #0 | |||
| # beq cdot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne cdot_kernel_S_BEGIN | |||
| @@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble ddot_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq ddot_kernel_L999 | |||
| # cmp INC_X, #0 | |||
| # beq ddot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq ddot_kernel_L999 | |||
| # cmp INC_Y, #0 | |||
| # beq ddot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne ddot_kernel_S_BEGIN | |||
| @@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble sdot_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq sdot_kernel_L999 | |||
| # cmp INC_X, #0 | |||
| # beq sdot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq sdot_kernel_L999 | |||
| # cmp INC_Y, #0 | |||
| # beq sdot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne sdot_kernel_S_BEGIN | |||
| @@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble zdot_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq zdot_kernel_L999 | |||
| # cmp INC_X, #0 | |||
| # beq zdot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq zdot_kernel_L999 | |||
| # cmp INC_Y, #0 | |||
| # beq zdot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne zdot_kernel_S_BEGIN | |||
| @@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = casum_compute(n, x, inc_x); | |||
| @@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if (n <= 0) return 0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| do_copy(n, x, inc_x, y, inc_y); | |||
| @@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = dasum_compute(n, x, inc_x); | |||
| @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " faddp "DOTF", v0.2d \n" | |||
| #endif /* !defined(DSDOT) */ | |||
| #else /* !defined(DOUBLE) */ | |||
| #else /* !defined(DOUBLE) */ | |||
| #define KERNEL_F1 \ | |||
| " ldr "TMPX", ["X"] \n" \ | |||
| " ldr "TMPY", ["Y"] \n" \ | |||
| @@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| @@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| nrm2_compute(n, x, inc_x, &ssq, &scale); | |||
| @@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| nrm2 = nrm2_compute(n, x, inc_x); | |||
| @@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG max_index = 0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| max_index = iamax_compute(n, x, inc_x); | |||
| @@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG max_index = 0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| max_index = izamax_compute(n, x, inc_x); | |||
| @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = sasum_compute(n, x, inc_x); | |||
| @@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| nrm2_double = nrm2_compute(n, x, inc_x); | |||
| @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = zasum_compute(n, x, inc_x); | |||
| @@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| CIMAG(zdot) = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| zdot_compute(n, x, inc_x, y, inc_y, &zdot); | |||
| @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| #CROTKERNEL = ../arm/zrot.c | |||
| CROTKERNEL = zrot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
| SSCALKERNEL = sscal.c | |||
| @@ -647,7 +647,9 @@ static int get_l2_size_old(void){ | |||
| return 6144; | |||
| } | |||
| } | |||
| return 0; | |||
| // return 0; | |||
| fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); | |||
| return 256; | |||
| } | |||
| #endif | |||
| @@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){ | |||
| l2 = BITMASK(ecx, 16, 0xffff); | |||
| #ifndef ARCH_X86 | |||
| if (l2 <= 0) { | |||
| fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); | |||
| return 256; | |||
| } | |||
| return l2; | |||
| #else | |||
| @@ -871,6 +877,22 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef SKYLAKEX | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "SkylakeX\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef OPTERON | |||
| @@ -1,3 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.PENRYN | |||
| SSWAPKERNEL = ../arm/swap.c | |||
| DSWAPKERNEL = ../arm/swap.c | |||
| @@ -138,6 +138,14 @@ | |||
| /* INCX != 1 or INCY != 1 */ | |||
| .L14: | |||
| cmpl $0, %ebx | |||
| jne .L141 | |||
| cmpl $0, %ecx | |||
| jne .L141 | |||
| /* INCX == 0 and INCY == 0 */ | |||
| jmp .L27 | |||
| .L141: | |||
| movl %edx, %eax | |||
| sarl $2, %eax | |||
| jle .L28 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -0,0 +1,19 @@ | |||
| include $(KERNELDIR)/KERNEL.HASWELL | |||
| SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S | |||
| #DTRMMKERNEL = ../generic/trmmkernel_16x2.c | |||
| #DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S | |||
| #DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| #DGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| #DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| #DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| #DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| #DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| #DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| #DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "caxpy_microk_steamroller-2.c" | |||
| #elif defined(BULLDOZER) | |||
| #include "caxpy_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) | |||
| #include "caxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "caxpy_microk_sandy-2.c" | |||
| @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "cdot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) | |||
| #include "cdot_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "cdot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "cdot_microk_sandy-2.c" | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) | |||
| #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "cgemv_n_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "cgemv_n_microk_bulldozer-4.c" | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) | |||
| #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "cgemv_t_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "cgemv_t_microk_bulldozer-4.c" | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) | |||
| #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "cscal_microk_haswell-2.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "cscal_microk_bulldozer-2.c" | |||
| @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "daxpy_microk_steamroller-2.c" | |||
| #elif defined(PILEDRIVER) | |||
| #include "daxpy_microk_piledriver-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "daxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "daxpy_microk_sandy-2.c" | |||
| @@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #if defined(BULLDOZER) | |||
| #include "ddot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "ddot_microk_steamroller-2.c" | |||
| #elif defined(PILEDRIVER) | |||
| #include "ddot_microk_piledriver-2.c" | |||
| #elif defined(NEHALEM) | |||
| #elif defined(NEHALEM) | |||
| #include "ddot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "ddot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "ddot_microk_sandy-2.c" | |||
| @@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| BLASLONG n1 = n & -4; | |||
| BLASLONG n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| @@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT dot = 0.0; | |||
| #if defined(SMP) | |||
| nthreads = num_cpu_avail(1); | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| if (n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) | |||
| #include "dgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "dscal_microk_bulldozer-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dscal_microk_sandy-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "dscal_microk_haswell-2.c" | |||
| #endif | |||
| @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dsymv_L_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "dsymv_L_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dsymv_L_microk_sandy-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dsymv_U_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "dsymv_U_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dsymv_U_microk_sandy-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "saxpy_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "saxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "saxpy_microk_sandy-2.c" | |||
| @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sdot_microk_steamroller-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sdot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "sdot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sdot_microk_sandy-2.c" | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sgemv_n_microk_nehalem-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_n_microk_sandy-4.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "sgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_t_microk_sandy-4.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "sgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "ssymv_L_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_L_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "ssymv_L_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "ssymv_L_microk_sandy-2.c" | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "ssymv_U_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_U_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "ssymv_U_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "ssymv_U_microk_sandy-2.c" | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zaxpy_microk_bulldozer-2.c" | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zaxpy_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "zaxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zaxpy_microk_sandy-2.c" | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zdot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) | |||
| #include "zdot_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "zdot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zdot_microk_sandy-2.c" | |||
| @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) | |||
| #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "zgemv_n_microk_haswell-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zgemv_n_microk_sandy-4.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "zgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(ZEN) | |||
| #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #include "zscal_microk_haswell-2.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "zscal_microk_bulldozer-2.c" | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 24) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||