Merge develop for bugfix release 0.3.17 (tags/v0.3.17)
@@ -1,5 +1,20 @@ | |||||
OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
==================================================================== | ==================================================================== | ||||
Version 0.3.17 | |||||
15-Jul-2021 | |||||
common: | |||||
- reverted the optimization of SGEMV_N/DGEMV_N for small input sizes | |||||
and consecutive arguments as it led to stack overflows on x86_64 | |||||
with some operating systems (notably OSX and Windows) | |||||
x86_64: | |||||
- reverted the performance patch for SGEMV_T on AVX512 as it caused | |||||
wrong results in some applications | |||||
SPARC: | |||||
- fixed compilation with compilers other than gcc | |||||
==================================================================== | |||||
Version 0.3.16 | Version 0.3.16 | ||||
11-Jul-2021 | 11-Jul-2021 | ||||
@@ -3,7 +3,7 @@ | |||||
# | # | ||||
# This library's version | # This library's version | ||||
VERSION = 0.3.16 | |||||
VERSION = 0.3.16.dev | |||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
@@ -54,6 +54,7 @@ | |||||
#define VENDOR_TRANSMETA 9 | #define VENDOR_TRANSMETA 9 | ||||
#define VENDOR_NSC 10 | #define VENDOR_NSC 10 | ||||
#define VENDOR_HYGON 11 | #define VENDOR_HYGON 11 | ||||
#define VENDOR_ZHAOXIN 12 | |||||
#define VENDOR_UNKNOWN 99 | #define VENDOR_UNKNOWN 99 | ||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | ||||
@@ -283,7 +283,7 @@ int get_vendor(void){ | |||||
if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; | if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; | ||||
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; | if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; | ||||
if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; | if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; | ||||
if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; | |||||
if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; | |||||
if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; | if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; | ||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | ||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | ||||
@@ -1067,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||||
if ((get_vendor() == VENDOR_AMD) || | if ((get_vendor() == VENDOR_AMD) || | ||||
(get_vendor() == VENDOR_HYGON) || | (get_vendor() == VENDOR_HYGON) || | ||||
(get_vendor() == VENDOR_CENTAUR)) { | |||||
(get_vendor() == VENDOR_CENTAUR) || | |||||
(get_vendor() == VENDOR_ZHAOXIN)) { | |||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | ||||
LDTB.size = 4096; | LDTB.size = 4096; | ||||
@@ -1190,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||||
int get_cpuname(void){ | int get_cpuname(void){ | ||||
int family, exfamily, model, vendor, exmodel; | |||||
int family, exfamily, model, vendor, exmodel, stepping; | |||||
if (!have_cpuid()) return CPUTYPE_80386; | if (!have_cpuid()) return CPUTYPE_80386; | ||||
@@ -1198,6 +1199,7 @@ int get_cpuname(void){ | |||||
exfamily = get_cputype(GET_EXFAMILY); | exfamily = get_cputype(GET_EXFAMILY); | ||||
model = get_cputype(GET_MODEL); | model = get_cputype(GET_MODEL); | ||||
exmodel = get_cputype(GET_EXMODEL); | exmodel = get_cputype(GET_EXMODEL); | ||||
stepping = get_cputype(GET_STEPPING); | |||||
vendor = get_vendor(); | vendor = get_vendor(); | ||||
@@ -1628,15 +1630,20 @@ int get_cpuname(void){ | |||||
switch (family) { | switch (family) { | ||||
case 0x5: | case 0x5: | ||||
return CPUTYPE_CENTAURC6; | return CPUTYPE_CENTAURC6; | ||||
break; | |||||
case 0x6: | case 0x6: | ||||
return CPUTYPE_NANO; | |||||
break; | |||||
case 0x7: | |||||
if (model == 0xf && stepping < 0xe) | |||||
return CPUTYPE_NANO; | |||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
break; | |||||
default: | |||||
if (family >= 0x7) | |||||
return CPUTYPE_NEHALEM; | |||||
else | |||||
return CPUTYPE_VIAC3; | |||||
} | } | ||||
return CPUTYPE_VIAC3; | |||||
} | |||||
if (vendor == VENDOR_ZHAOXIN){ | |||||
return CPUTYPE_NEHALEM; | |||||
} | } | ||||
if (vendor == VENDOR_RISE){ | if (vendor == VENDOR_RISE){ | ||||
@@ -1869,7 +1876,7 @@ char *get_lower_cpunamechar(void){ | |||||
int get_coretype(void){ | int get_coretype(void){ | ||||
int family, exfamily, model, exmodel, vendor; | |||||
int family, exfamily, model, exmodel, vendor, stepping; | |||||
if (!have_cpuid()) return CORE_80486; | if (!have_cpuid()) return CORE_80486; | ||||
@@ -1877,6 +1884,7 @@ int get_coretype(void){ | |||||
exfamily = get_cputype(GET_EXFAMILY); | exfamily = get_cputype(GET_EXFAMILY); | ||||
model = get_cputype(GET_MODEL); | model = get_cputype(GET_MODEL); | ||||
exmodel = get_cputype(GET_EXMODEL); | exmodel = get_cputype(GET_EXMODEL); | ||||
stepping = get_cputype(GET_STEPPING); | |||||
vendor = get_vendor(); | vendor = get_vendor(); | ||||
@@ -2286,13 +2294,19 @@ int get_coretype(void){ | |||||
if (vendor == VENDOR_CENTAUR) { | if (vendor == VENDOR_CENTAUR) { | ||||
switch (family) { | switch (family) { | ||||
case 0x6: | case 0x6: | ||||
return CORE_NANO; | |||||
break; | |||||
case 0x7: | |||||
if (model == 0xf && stepping < 0xe) | |||||
return CORE_NANO; | |||||
return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
break; | |||||
default: | |||||
if (family >= 0x7) | |||||
return CORE_NEHALEM; | |||||
else | |||||
return CORE_VIAC3; | |||||
} | } | ||||
return CORE_VIAC3; | |||||
} | |||||
if (vendor == VENDOR_ZHAOXIN) { | |||||
return CORE_NEHALEM; | |||||
} | } | ||||
return CORE_UNKNOWN; | return CORE_UNKNOWN; | ||||
@@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||||
#define VENDOR_AMD 2 | #define VENDOR_AMD 2 | ||||
#define VENDOR_CENTAUR 3 | #define VENDOR_CENTAUR 3 | ||||
#define VENDOR_HYGON 4 | #define VENDOR_HYGON 4 | ||||
#define VENDOR_ZHAOXIN 5 | |||||
#define VENDOR_UNKNOWN 99 | #define VENDOR_UNKNOWN 99 | ||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | ||||
@@ -404,7 +405,7 @@ static int get_vendor(void){ | |||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | ||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | ||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | ||||
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR; | |||||
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; | |||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | ||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | ||||
@@ -415,7 +416,7 @@ static int get_vendor(void){ | |||||
static gotoblas_t *get_coretype(void){ | static gotoblas_t *get_coretype(void){ | ||||
int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
int family, exfamily, model, vendor, exmodel; | |||||
int family, exfamily, model, vendor, exmodel, stepping; | |||||
cpuid(1, &eax, &ebx, &ecx, &edx); | cpuid(1, &eax, &ebx, &ecx, &edx); | ||||
@@ -423,6 +424,7 @@ static gotoblas_t *get_coretype(void){ | |||||
exfamily = BITMASK(eax, 20, 0xff); | exfamily = BITMASK(eax, 20, 0xff); | ||||
model = BITMASK(eax, 4, 0x0f); | model = BITMASK(eax, 4, 0x0f); | ||||
exmodel = BITMASK(eax, 16, 0x0f); | exmodel = BITMASK(eax, 16, 0x0f); | ||||
stepping = BITMASK(eax, 0, 0x0f); | |||||
vendor = get_vendor(); | vendor = get_vendor(); | ||||
@@ -824,13 +826,19 @@ static gotoblas_t *get_coretype(void){ | |||||
if (vendor == VENDOR_CENTAUR) { | if (vendor == VENDOR_CENTAUR) { | ||||
switch (family) { | switch (family) { | ||||
case 0x6: | case 0x6: | ||||
return &gotoblas_NANO; | |||||
break; | |||||
case 0x7: | |||||
if (model == 0xf && stepping < 0xe) | |||||
return &gotoblas_NANO; | |||||
return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
default: | |||||
if (family >= 0x7) | |||||
return &gotoblas_NEHALEM; | |||||
} | } | ||||
} | } | ||||
if (vendor == VENDOR_ZHAOXIN) { | |||||
return &gotoblas_NEHALEM; | |||||
} | |||||
return NULL; | return NULL; | ||||
} | } | ||||
@@ -201,12 +201,14 @@ void CNAME(enum CBLAS_ORDER order, | |||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | ||||
if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
#if 0 | |||||
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ | |||||
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | ||||
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | ||||
return; | return; | ||||
} | } | ||||
#endif | |||||
IDEBUG_START; | IDEBUG_START; | ||||
FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "sgemv_t_microk_haswell-4.c" | #include "sgemv_t_microk_haswell-4.c" | ||||
#elif defined (SKYLAKEX) || defined (COOPERLAKE) | #elif defined (SKYLAKEX) || defined (COOPERLAKE) | ||||
#include "sgemv_t_microk_haswell-4.c" | #include "sgemv_t_microk_haswell-4.c" | ||||
#include "sgemv_t_microk_skylakex.c" | |||||
/*#include "sgemv_t_microk_skylakex.c"*/ | |||||
#endif | #endif | ||||
#if defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
@@ -99,6 +99,8 @@ typedef int blasint; | |||||
/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ | /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ | ||||
#ifdef OPENBLAS_OS_LINUX | #ifdef OPENBLAS_OS_LINUX | ||||
#define _GNU_SOURCE | |||||
#ifndef _GNU_SOURCE | |||||
#define _GNU_SOURCE | |||||
#endif | |||||
#include <sched.h> | #include <sched.h> | ||||
#endif | #endif |
@@ -2502,7 +2502,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
#define GEMM_DEFAULT_OFFSET_B 2048 | #define GEMM_DEFAULT_OFFSET_B 2048 | ||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
#define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
@@ -2534,7 +2534,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
#define GEMM_DEFAULT_OFFSET_B 2048 | #define GEMM_DEFAULT_OFFSET_B 2048 | ||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
#define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||