@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
set(OpenBLAS_PATCH_VERSION 4.dev) | |||||
set(OpenBLAS_PATCH_VERSION 5.dev) | |||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
# Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
@@ -1,4 +1,77 @@ | |||||
OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
==================================================================== | |||||
Version 0.3.4 | |||||
02-Dec-2018 | |||||
common: | |||||
* the new, experimental thread-local memory allocation had | |||||
inadvertently been left enabled for gmake builds in 0.3.3 | |||||
despite the announcement. It is now disabled by default, and | |||||
single-threaded builds will keep using the old allocator even | |||||
if the USE_TLS option is turned on. | |||||
* OpenBLAS will now provide enough buffer space for at least 50 | |||||
threads by default. | |||||
* The output of openblas_get_config() now contains the version | |||||
number. | |||||
* A serious thread safety bug in GEMV operation with small M and | |||||
large N size has been fixed. | |||||
* The code will now automatically call blas_thread_init after a | |||||
fork if needed before handling a call to openblas_set_num_threads | |||||
* Accesses to parallelized level3 functions from multiple callers | |||||
are now serialized to avoid thread races (unless using OpenMP). | |||||
This should provide better performance than the known-threadsafe | |||||
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option. | |||||
* When building LAPACK with gfortran, -frecursive is now (again) | |||||
enabled by default to ensure correct behaviour. | |||||
* The OpenBLAS version of cblas.h now supports both CBLAS_ORDER and | |||||
CBLAS_LAYOUT as the name of the matrix row/column order option. | |||||
* Externally set LDFLAGS are now passed through to the final compile/link | |||||
steps to facilitate setting platform-specific linker flags. | |||||
* A potential race condition during the build of LAPACK (that would | |||||
usually manifest itself as a failure to build TESTING/MATGEN) has been | |||||
fixed. | |||||
* xHEMV has been changed to stay single-threaded for small input sizes | |||||
where the overhead of multithreading exceeds any possible gains | |||||
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or | |||||
ThunderX hardware with sizable input. | |||||
* Linker flags for the PGI compiler have been updated | |||||
* Behaviour of AXPY with zero increments is now handled in the C interface, | |||||
correcting the result on at least Intel Atom. | |||||
* The result matrix from calling SGELSS with an all-zero input matrix is | |||||
now zeroed completely. | |||||
x86_64: | |||||
* Autodetection of AMD Ryzen2 has been fixed (again). | |||||
* CMAKE builds now support labeling of an INTERFACE64=1 build of | |||||
the library with the _64 suffix. | |||||
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel | |||||
has been sped up by rewriting with C intrinsics | |||||
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) | |||||
POWER: | |||||
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox). | |||||
* CPU type detection has been implemented for AIX. | |||||
* CPU type detection has been fixed for NETBSD. | |||||
MIPS64: | |||||
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. | |||||
* DSDOT on LOONGSON3A has been fixed. | |||||
* the SGEMM microkernel has been hardened against potential data loss. | |||||
ARMV8: | |||||
* DYNAMIC_ARCH support is now available for 64bit ARM | |||||
* cross-compiling for ARMV8 under iOS now works. | |||||
* cpu-specific code has been rearranged to make better use of both | |||||
hardware commonalities and model-specific compiler optimizations. | |||||
* XGENE1 has been removed as a TARGET, superseded by the improved generic | |||||
ARMV8 support. | |||||
ARMV7: | |||||
* Older assembly mnemonics have been converted to UAL form to allow | |||||
building with clang 7.0 | |||||
* Cross compiling LAPACKE for Android has been fixed again (broken by | |||||
update to LAPACK 3.7.0 some while ago). | |||||
==================================================================== | ==================================================================== | ||||
Version 0.3.3 | Version 0.3.3 | ||||
31-Aug-2018 | 31-Aug-2018 | ||||
@@ -3,7 +3,7 @@ | |||||
# | # | ||||
# This library's version | # This library's version | ||||
VERSION = 0.3.4.dev | |||||
VERSION = 0.3.5.dev | |||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
@@ -18,7 +18,7 @@ else ifeq ($(ARCH), i386) | |||||
override ARCH=x86 | override ARCH=x86 | ||||
else ifeq ($(ARCH), aarch64) | else ifeq ($(ARCH), aarch64) | ||||
override ARCH=arm64 | override ARCH=arm64 | ||||
endif | |||||
endif | |||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | ||||
@@ -1042,6 +1042,8 @@ ifdef USE_TLS | |||||
CCOMMON_OPT += -DUSE_TLS | CCOMMON_OPT += -DUSE_TLS | ||||
endif | endif | ||||
CCOMMON_OPT += -DVERSION=\"$(VERSION)\" | |||||
ifndef SYMBOLPREFIX | ifndef SYMBOLPREFIX | ||||
SYMBOLPREFIX = | SYMBOLPREFIX = | ||||
endif | endif | ||||
@@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION) | |||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") | set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") | ||||
endif () | endif () | ||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") | |||||
set(REVISION "-r${OpenBLAS_VERSION}") | set(REVISION "-r${OpenBLAS_VERSION}") | ||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) | set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) | ||||
@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") | |||||
set(HOST_OS WINNT) | set(HOST_OS WINNT) | ||||
endif () | endif () | ||||
if (${HOST_OS} STREQUAL "LINUX")
  # Check if we're building natively on Android (TERMUX): HOST_OS is LINUX
  # at this point, so ask `uname -o` for the operating system name and
  # switch HOST_OS to ANDROID when it mentions "Android".
  execute_process(
    COMMAND uname -o
    OUTPUT_VARIABLE OPERATING_SYSTEM
    OUTPUT_STRIP_TRAILING_WHITESPACE  # replaces piping through `tr -d '\n'`
    ERROR_QUIET)
  # Quote the expansion: if uname is unavailable or rejects -o, the variable
  # is empty and an unquoted ${OPERATING_SYSTEM} would leave if() with a
  # missing operand, aborting the configure step.
  if ("${OPERATING_SYSTEM}" MATCHES "Android")
    set(HOST_OS ANDROID)
  endif ()
endif ()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | ||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | ||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | ||||
@@ -175,9 +175,9 @@ int detect(void){ | |||||
return CPUTYPE_PPC970; | return CPUTYPE_PPC970; | ||||
#endif | #endif | ||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) | |||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) | |||||
int id; | int id; | ||||
id = __asm __volatile("mfpvr %0" : "=r"(id)); | |||||
__asm __volatile("mfpvr %0" : "=r"(id)); | |||||
switch ( id >> 16 ) { | switch ( id >> 16 ) { | ||||
case 0x4e: // POWER9 | case 0x4e: // POWER9 | ||||
return CPUTYPE_POWER8; | return CPUTYPE_POWER8; | ||||
@@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){ | |||||
printf("Alloc Start ...\n"); | printf("Alloc Start ...\n"); | ||||
#endif | #endif | ||||
#if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
/* #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
mypos = WhereAmI(); | mypos = WhereAmI(); | ||||
@@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){ | |||||
do { | do { | ||||
if (!memory[position].used && (memory[position].pos == mypos)) { | if (!memory[position].used && (memory[position].pos == mypos)) { | ||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
/* blas_lock(&memory[position].lock);*/ | |||||
// blas_lock(&memory[position].lock); | |||||
if (!memory[position].used) goto allocation; | if (!memory[position].used) goto allocation; | ||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
/* blas_unlock(&memory[position].lock);*/ | |||||
// blas_unlock(&memory[position].lock); | |||||
} | } | ||||
position ++; | position ++; | ||||
@@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){ | |||||
} while (position < NUM_BUFFERS); | } while (position < NUM_BUFFERS); | ||||
#endif | |||||
#endif */ | |||||
position = 0; | position = 0; | ||||
@@ -42,8 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
static char* openblas_config_str="" | static char* openblas_config_str="" | ||||
"OpenBLAS " | |||||
VERSION | |||||
" " | |||||
#ifdef USE64BITINT | #ifdef USE64BITINT | ||||
"USE64BITINT " | |||||
" USE64BITINT " | |||||
#endif | #endif | ||||
#ifdef NO_CBLAS | #ifdef NO_CBLAS | ||||
"NO_CBLAS " | "NO_CBLAS " | ||||
@@ -1,12 +1,13 @@ | |||||
CAXPYKERNEL = ../mips/zaxpy.c | CAXPYKERNEL = ../mips/zaxpy.c | ||||
ZAXPYKERNEL = ../mips/zaxpy.c | ZAXPYKERNEL = ../mips/zaxpy.c | ||||
SROTKERNEL = ../mips/rot.c | |||||
DROTKERNEL = ../mips/rot.c | |||||
CROTKERNEL = ../mips/zrot.c | |||||
ZROTKERNEL = ../mips/zrot.c | |||||
SROTKERNEL = ../mips/rot.c | |||||
DROTKERNEL = ../mips/rot.c | |||||
CROTKERNEL = ../mips/zrot.c | |||||
ZROTKERNEL = ../mips/zrot.c | |||||
CSWAPKERNEL = ../mips/zswap.c | CSWAPKERNEL = ../mips/zswap.c | ||||
ZSWAPKERNEL = ../mips/zswap.c | ZSWAPKERNEL = ../mips/zswap.c | ||||
ifndef SNRM2KERNEL | ifndef SNRM2KERNEL | ||||
SNRM2KERNEL = snrm2.S | SNRM2KERNEL = snrm2.S | ||||
endif | endif | ||||
@@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
DSDOTKERNEL = ../mips/dot.c | |||||
@@ -146,11 +146,11 @@ | |||||
sd $21, 40($sp) | sd $21, 40($sp) | ||||
sd $22, 48($sp) | sd $22, 48($sp) | ||||
ST $f24, 56($sp) | |||||
ST $f25, 64($sp) | |||||
ST $f26, 72($sp) | |||||
ST $f27, 80($sp) | |||||
ST $f28, 88($sp) | |||||
sdc1 $f24, 56($sp) | |||||
sdc1 $f25, 64($sp) | |||||
sdc1 $f26, 72($sp) | |||||
sdc1 $f27, 80($sp) | |||||
sdc1 $f28, 88($sp) | |||||
#if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
sd $23, 96($sp) | sd $23, 96($sp) | ||||
@@ -161,10 +161,10 @@ | |||||
#endif | #endif | ||||
#ifndef __64BIT__ | #ifndef __64BIT__ | ||||
ST $f20,120($sp) | |||||
ST $f21,128($sp) | |||||
ST $f22,136($sp) | |||||
ST $f23,144($sp) | |||||
sdc1 $f20,120($sp) | |||||
sdc1 $f21,128($sp) | |||||
sdc1 $f22,136($sp) | |||||
sdc1 $f23,144($sp) | |||||
#endif | #endif | ||||
.align 4 | .align 4 | ||||
@@ -7766,11 +7766,11 @@ | |||||
ld $21, 40($sp) | ld $21, 40($sp) | ||||
ld $22, 48($sp) | ld $22, 48($sp) | ||||
LD $f24, 56($sp) | |||||
LD $f25, 64($sp) | |||||
LD $f26, 72($sp) | |||||
LD $f27, 80($sp) | |||||
LD $f28, 88($sp) | |||||
ldc1 $f24, 56($sp) | |||||
ldc1 $f25, 64($sp) | |||||
ldc1 $f26, 72($sp) | |||||
ldc1 $f27, 80($sp) | |||||
ldc1 $f28, 88($sp) | |||||
#if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
ld $23, 96($sp) | ld $23, 96($sp) | ||||
@@ -7779,10 +7779,10 @@ | |||||
#endif | #endif | ||||
#ifndef __64BIT__ | #ifndef __64BIT__ | ||||
LD $f20,120($sp) | |||||
LD $f21,128($sp) | |||||
LD $f22,136($sp) | |||||
LD $f23,144($sp) | |||||
ldc1 $f20,120($sp) | |||||
ldc1 $f21,128($sp) | |||||
ldc1 $f22,136($sp) | |||||
ldc1 $f23,144($sp) | |||||
#endif | #endif | ||||
daddiu $sp,$sp,STACKSIZE | daddiu $sp,$sp,STACKSIZE | ||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||||
} | } | ||||
if (n == 0 || m == 0) | if (n == 0 || m == 0) | ||||
return; | |||||
return 0; | |||||
c_offset = c; | c_offset = c; | ||||