Merge branch 'develop' into fbsd12

6 years ago · c5f8aeff2d
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
 set(OpenBLAS_PATCH_VERSION 4.dev)
 set(OpenBLAS_PATCH_VERSION 5.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,77 @@
 OpenBLAS ChangeLog
 ====================================================================
 Version 0.3.4
 02-Dec-2018

 common:
 	* the new, experimental thread-local memory allocation had 
 	  inadvertently been left enabled for gmake builds in 0.3.3
 	  despite the announcement. It is now disabled by default, and
 	  single-threaded builds will keep using the old allocator even
 	  if the USE_TLS option is turned on.
 	* OpenBLAS will now provide enough buffer space for at least 50
 	  threads by default.
 	* The output of openblas_get_config() now contains the version
 	  number.
 	* A serious thread safety bug in GEMV operation with small M and
 	  large N size has been fixed.
 	* The code will now automatically call blas_thread_init after a
 	  fork if needed before handling a call to openblas_set_num_threads
 	* Accesses to parallelized level3 functions from multiple callers
 	  are now serialized to avoid thread races (unless using OpenMP).
 	  This should provide better performance than the known-threadsafe
 	  (but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
 	* When building LAPACK with gfortran, -frecursive is now (again)
 	  enabled by default to ensure correct behaviour.
        * The OpenBLAS version of cblas.h now supports both CBLAS_ORDER and
 	  CBLAS_LAYOUT as the name of the matrix row/column order option.
 	* Externally set LDFLAGS are now passed through to the final compile/link
 	  steps to facilitate setting platform-specific linker flags.
 	* A potential race condition during the build of LAPACK (that would 
 	  usually manifest itself as a failure to build TESTING/MATGEN) has been 
 	  fixed.
 	* xHEMV has been changed to stay single-threaded for small input sizes
 	  where the overhead of multithreading exceeds any possible gains
 	* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
 	  ThunderX hardware with sizable input.
 	* Linker flags for the PGI compiler have been updated
 	* Behaviour of AXPY with zero increments is now handled in the C interface,
 	  correcting the result on at least Intel Atom.
 	* The result matrix from calling SGELSS with an all-zero input matrix is 
 	  now zeroed completely.
 	  
 x86_64:
 	* Autodetection of AMD Ryzen2 has been fixed (again).
        * CMAKE builds now support labeling of an INTERFACE64=1 build of
 	  the library with the _64 suffix.
 	* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
 	  has been sped up by rewriting with C intrinsics
 	* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
 	
 POWER:
 	* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
 	* CPU type detection has been implemented for AIX.
 	* CPU type detection has been fixed for NETBSD.
 	
 MIPS64:
 	* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
 	* DSDOT on LOONGSON3A has been fixed.
 	* the SGEMM microkernel has been hardened against potential data loss.
 	
 ARMV8:
 	* DYNAMIC_ARCH support is now available for 64bit ARM
 	* cross-compiling for ARMV8 under iOS now works.
 	* cpu-specific code has been rearranged to make better use of both
 	  hardware commonalities and model-specific compiler optimizations.
 	* XGENE1 has been removed as a TARGET, superseded by the improved generic
 	  ARMV8 support.
 	
 ARMV7:
 	* Older assembly mnemonics have been converted to UAL form to allow
 	  building with clang 7.0
 	* Cross compiling LAPACKE for Android has been fixed again (broken by
 	  update to LAPACK 3.7.0 some while ago).  
 	  
 ====================================================================
 Version 0.3.3
 31-Aug-2018
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
 VERSION = 0.3.4.dev
 VERSION = 0.3.5.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
--- a/Makefile.system
+++ b/Makefile.system
@@ -18,7 +18,7 @@ else ifeq ($(ARCH), i386)
 override ARCH=x86
 else ifeq ($(ARCH), aarch64)
 override ARCH=arm64
 endif 
 endif

 NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib

@@ -1042,6 +1042,8 @@ ifdef USE_TLS
 CCOMMON_OPT += -DUSE_TLS
 endif

 CCOMMON_OPT += -DVERSION=\"$(VERSION)\"

 ifndef SYMBOLPREFIX
 SYMBOLPREFIX =
 endif
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION)
  set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
 endif ()

 # Append a -DVERSION=... definition carrying ${OpenBLAS_VERSION} to the common
 # compiler options. The value is wrapped in two layers of escaped double
 # quotes: the outer pair is presumably consumed when the flag string is
 # re-parsed by the build tool/shell, leaving the inner pair so that VERSION
 # reaches the C sources as a string literal.
 # NOTE(review): confirm the quoting survives with every generator in use.
 set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")

 set(REVISION "-r${OpenBLAS_VERSION}")
 set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
  set(HOST_OS WINNT)
 endif ()

 # Detect a native build on Android (e.g. inside Termux): the host reports
 # itself as LINUX, but `uname -o` prints "Android" there. In that case
 # HOST_OS is overridden to ANDROID.
 if ("${HOST_OS}" STREQUAL "LINUX")
   # execute_process() does not go through a shell, so the trailing newline is
   # removed with OUTPUT_STRIP_TRAILING_WHITESPACE rather than by piping
   # through `tr` with shell-style quoting.
   execute_process(
     COMMAND uname -o
     OUTPUT_VARIABLE OPERATING_SYSTEM
     OUTPUT_STRIP_TRAILING_WHITESPACE)
   # Quoted so that an empty result (uname missing or lacking -o) evaluates
   # to "no match" instead of producing a malformed if() expression.
   if ("${OPERATING_SYSTEM}" MATCHES "Android")
     set(HOST_OS ANDROID)
   endif ()
 endif ()



 if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
              OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -175,9 +175,9 @@ int detect(void){
  return  CPUTYPE_PPC970;
 #endif

 #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
 #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
 int id;
 id = __asm __volatile("mfpvr %0" : "=r"(id));
 __asm __volatile("mfpvr %0" : "=r"(id));
 switch ( id >> 16 ) {
  case 0x4e: // POWER9
    return CPUTYPE_POWER8;
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){
  printf("Alloc Start ...\n");
 #endif

 #if defined(WHEREAMI) && !defined(USE_OPENMP)
 /* #if defined(WHEREAMI) && !defined(USE_OPENMP)

  mypos = WhereAmI();

@@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){
  do {
    if (!memory[position].used && (memory[position].pos == mypos)) {
      LOCK_COMMAND(&alloc_lock);
 /*      blas_lock(&memory[position].lock);*/
 //      blas_lock(&memory[position].lock);

      if (!memory[position].used) goto allocation;

      UNLOCK_COMMAND(&alloc_lock);
 /*      blas_unlock(&memory[position].lock);*/
 //      blas_unlock(&memory[position].lock);
    }

    position ++;
@@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){
  } while (position < NUM_BUFFERS);


 #endif
 #endif */

  position = 0;

--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -42,8 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 static char* openblas_config_str=""
 "OpenBLAS "
 VERSION
 " "
 #ifdef USE64BITINT
  "USE64BITINT "
  " USE64BITINT "
 #endif
 #ifdef NO_CBLAS
  "NO_CBLAS "
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -1,12 +1,13 @@
 CAXPYKERNEL = ../mips/zaxpy.c
 ZAXPYKERNEL = ../mips/zaxpy.c
 SROTKERNEL = ../mips/rot.c
 DROTKERNEL = ../mips/rot.c
 CROTKERNEL = ../mips/zrot.c
 ZROTKERNEL = ../mips/zrot.c
 SROTKERNEL  = ../mips/rot.c
 DROTKERNEL  = ../mips/rot.c
 CROTKERNEL  = ../mips/zrot.c
 ZROTKERNEL  = ../mips/zrot.c
 CSWAPKERNEL = ../mips/zswap.c
 ZSWAPKERNEL = ../mips/zswap.c

                                                                                        
                                                                                                                                          
 ifndef SNRM2KERNEL
 SNRM2KERNEL = snrm2.S
 endif
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -63,6 +63,7 @@ ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c

 DSDOTKERNEL     = ../mips/dot.c



--- a/kernel/mips64/sgemm_kernel_8x4_ps.S
+++ b/kernel/mips64/sgemm_kernel_8x4_ps.S
@@ -146,11 +146,11 @@
 	sd	$21,  40($sp)
 	sd	$22,  48($sp)

 	ST	$f24, 56($sp)
 	ST	$f25, 64($sp)
 	ST	$f26, 72($sp)
 	ST	$f27, 80($sp)
 	ST	$f28, 88($sp)
 	sdc1	$f24, 56($sp)
 	sdc1	$f25, 64($sp)
 	sdc1	$f26, 72($sp)
 	sdc1	$f27, 80($sp)
 	sdc1	$f28, 88($sp)

 #if defined(TRMMKERNEL)
 	sd	$23,  96($sp)
@@ -161,10 +161,10 @@
 #endif

 #ifndef __64BIT__
 	ST	$f20,120($sp)
 	ST	$f21,128($sp)
 	ST	$f22,136($sp)
 	ST	$f23,144($sp)
 	sdc1	$f20,120($sp)
 	sdc1	$f21,128($sp)
 	sdc1	$f22,136($sp)
 	sdc1	$f23,144($sp)
 #endif

 	.align	4
@@ -7766,11 +7766,11 @@
 	ld	$21,  40($sp)
 	ld	$22,  48($sp)

 	LD	$f24, 56($sp)
 	LD	$f25, 64($sp)
 	LD	$f26, 72($sp)
 	LD	$f27, 80($sp)
 	LD	$f28, 88($sp)
 	ldc1	$f24, 56($sp)
 	ldc1	$f25, 64($sp)
 	ldc1	$f26, 72($sp)
 	ldc1	$f27, 80($sp)
 	ldc1	$f28, 88($sp)

 #if defined(TRMMKERNEL)
 	ld	$23,  96($sp)
@@ -7779,10 +7779,10 @@
 #endif

 #ifndef __64BIT__
 	LD	$f20,120($sp)
 	LD	$f21,128($sp)
 	LD	$f22,136($sp)
 	LD	$f23,144($sp)
 	ldc1	$f20,120($sp)
 	ldc1	$f21,128($sp)
 	ldc1	$f22,136($sp)
 	ldc1	$f23,144($sp)
 #endif

 	daddiu	$sp,$sp,STACKSIZE
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
  }

  if (n == 0 || m == 0)
 	return;
 	return 0;

  c_offset = c;