Merge pull request #102 from xianyi/develop

rebase
5 years ago · 89eea6b455
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 10.dev)
+set(OpenBLAS_PATCH_VERSION 11.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,76 @@
 OpenBLAS ChangeLog
 ====================================================================
 Version 0.3.11
 17-Oct-2020

 common:
 	* API change:
 	  the newly added BFLOAT16 functions were renamed to use the
 	  letter "B" instead of "H" to avoid potential confusion with
 	  the IEEE "half precision float" type, i.e. the 0.3.10
 	  SHGEMM is now SBGEMM and the corresponding build option
 	  was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
 	* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
 	  limit for placing temporary arrays on the stack) to be compatible
 	  with a stack size of 1 MB (as imposed by the Java runtime library)
 	* Added mixed-precision dot function SBDOT and utility functions
 	  sbstobf16, sbdtobf16, sbf16tos and dbf16tod to convert between
 	  single or double precision float arrays and bfloat16 arrays
 	* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
 	  in lapack.h
 	* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
 	  (causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
 	* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
 	* Fixed several bugs in the LAPACK testsuite
 	* Improved performance of TRMM and TRSM for certain problem sizes
 	* Fixed infinite recursions and workspace miscalculations in ReLAPACK
 	* CMAKE builds no longer require pkg-config for creating the .pc file
 	* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as 
 	  enabling these options
 	* Fixed detection of gfortran when invoked through an mpi wrapper
 	* Improved thread reinitialization performance with OpenMP after a fork
 	* Added support for building only the subset of the library required
 	  for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
 	* Optional function name prefixes and suffixes are now correctly
 	  reflected in the generated cblas.h
 	* Added CMAKE build support for the LAPACK and multithreading tests

 POWER:
 	* Added optimized support for POWER10
 	* Added support for compiling for POWER8 in 32bit mode
 	* Added support for compilation with LLVM/clang
 	* Added support for compilation with NVIDIA/PGI compilers
 	* Fixed building on big-endian POWER8
 	* Fixed miscompilation of ZDOTC by gcc10
 	* Fixed alignment errors in the POWER8 SAXPY kernel
 	* Improved CPU detection on AIX
 	* Supported building with older compilers on POWER9

 x86_64:
 	* Added support for Intel Cooperlake
 	* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
 	* Added autodetection of Intel Comet Lake cpus
 	* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
 	* Reset the fpu state before using the fpu on Windows as a workaround
 	  for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
 	* Fixed potentially undefined behaviour in the dot and gemv_t kernels
 	* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
 	* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
 	
 ARMV7:
 	* Fixed cpu detection on BSD-like systems

 ARMV8:
 	* Added preliminary support for Apple Vortex cpus
 	* Added support for the Cavium ThunderX3T110 cpu
 	* Fixed cpu detection on BSD-like systems
 	* Fixed compilation in -std=c18 mode


 IBM Z:
 	* Added support for compiling with the clang compiler
 	* Improved GEMM performance on Z14

 ====================================================================
 Version 0.3.10
 14-Jun-2020
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.10.dev
+VERSION = 0.3.11.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -78,6 +78,10 @@ GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
 CCOMMON_OPT += -mavx2
 endif
 else 
 ifeq ($(C_COMPILER), CLANG)
 CCOMMON_OPT += -mavx2
 endif
 endif
 ifeq ($(F_COMPILER), GFORTRAN)
 # AVX2 support was added in 4.7.0