From e5c47e44f690ef661af893608f47d56b1d3a1cb4 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 30 Dec 2014 21:53:00 -0600 Subject: [PATCH 001/257] First pass at converting a few makefiles to CMake. --- .gitignore | 1 + CMakeLists.txt | 18 ++++++ cmake/c_check.cmake | 29 ++++++++++ cmake/f_check.cmake | 37 ++++++++++++ cmake/prebuild.cmake | 55 ++++++++++++++++++ cmake/system.cmake | 130 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 270 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 cmake/c_check.cmake create mode 100644 cmake/f_check.cmake create mode 100644 cmake/prebuild.cmake create mode 100644 cmake/system.cmake diff --git a/.gitignore b/.gitignore index 7422cead3..bae3d057f 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,4 @@ test/sblat3 test/zblat1 test/zblat2 test/zblat3 +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..2dbb6f059 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,18 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/23/14 +## Last Modified: 12/23/14 +## + +cmake_minimum_required(VERSION 2.8.4) +project(OpenBLAS) + +# is this necessary? lapack-netlib has its own fortran checks in its CMakeLists.txt +#enable_language(Fortran) + +message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with.") + +include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") + + diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake new file mode 100644 index 000000000..20c0aa72f --- /dev/null +++ b/cmake/c_check.cmake @@ -0,0 +1,29 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from the OpenBLAS/c_check perl script. +## This is triggered by prebuild.cmake and runs before any of the code is built. +## Creates config.h and Makefile.conf. + +# N.B. c_check is not cross-platform, so instead try to use CMake variables. 
Alternatively, could use try_compile to get some of this info the same way c_check does. + +# run c_check (creates the TARGET files) +# message(STATUS "Running c_check...") +# execute_process(COMMAND perl c_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_CXX_COMPILER} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + +# TODO: is ${BINARY} sufficient for the __32BIT__ define? +# TODO: CMAKE_SYSTEM_PROCESSOR is not set by CMake, need to set it manually when doing a cross-compile +# TODO: CMAKE_CXX_COMPILER_ID and CMAKE_SYSTEM_NAME are probably not the same strings as OpenBLAS is expecting +# TODO: detect this +set(NEED_FU 1) + +file(WRITE ${TARGET_CONF} + "#define OS_${CMAKE_SYSTEM_NAME}\t1\n" + "#define ARCH_${CMAKE_SYSTEM_PROCESSOR}\t1\n" + "#define C_${CMAKE_CXX_COMPILER_ID}\t1\n" + "#define __${BINARY}BIT__\t1\n" + "#define FUNDERSCORE\t${NEED_FU}\n") + diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake new file mode 100644 index 000000000..a291430aa --- /dev/null +++ b/cmake/f_check.cmake @@ -0,0 +1,37 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from the OpenBLAS/f_check perl script. +## This is triggered by prebuild.cmake and runs before any of the code is built. +## Appends Fortran information to config.h and Makefile.conf. + + +if (NOT ${ONLY_CBLAS}) + # N.B. f_check is not cross-platform, so instead try to use CMake variables + # run f_check (appends to TARGET files) +# message(STATUS "Running f_check...") +# execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + + # TODO: is BU makefile macro needed? + # TODO: detect whether underscore needed, set #defines appropriately - use try_compile + # TODO: set FEXTRALIB flags a la f_check? 
+ + set(BU "_") + file(APPEND ${TARGET_CONF} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n" + "#define NEED2UNDERSCORES 0\n") + +else () + + #When we only build CBLAS, we set NOFORTRAN=2 + set(NOFORTRAN 2) + set(NO_FBLAS 1) + set(BU "_") + file(APPEND ${TARGET_CONF} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") +endif() diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake new file mode 100644 index 000000000..34a683a4f --- /dev/null +++ b/cmake/prebuild.cmake @@ -0,0 +1,55 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from OpenBLAS/Makefile.prebuild +## This is triggered by system.cmake and runs before any of the code is built. +## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). +## Next it runs f_check and appends some fortran information to the files. +## Finally it runs getarch and getarch_2nd for even more environment information. + +# CPUIDEMU = ../../cpuid/table.o + +if (DEFINED CPUIDEMU) + set(EXFLAGS "-DCPUIDEMU -DVENDOR=99") +endif () + +if (DEFINED TARGET_CORE) + # set the C flags for just this file + set_source_files_properties(getarch_2nd.c PROPERTIES COMPILE_FLAGS "-DBUILD_KERNEL") + set(TARGET_MAKE "Makefile_kernel.conf") + set(TARGET_CONF "config_kernel.h") +else() + set(TARGET_MAKE "Makefile.conf") + set(TARGET_CONF "config.h") +endif () + +include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") +include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") + +# compile getarch +# TODO: need to use execute_process here, or compilation won't happen until later - maybe make temporary CMakeLists.txt file using file() ? 
+#add_executable(getarch getarch.c cpuid.S ${CPUIDEMU} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +# +## run getarch, which appends even more to the TARGET files +#message(STATUS "Running getarch") +#execute_process(COMMAND getarch 0 >> ${TARGET_MAKE} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +#execute_process(COMMAND getarch 1 >> ${TARGET_CONF} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +# +## config.h is ready for getarch_2nd now, so compile that +#set(GETARCH2_SOURCES getarch_2nd.c config.h) +#add_executable(getarch_2nd getarch_2nd.c config.h) +# +## finally run getarch_2nd, appending yet more to the TARGET files +#message(STATUS "Running getarch_2nd") +#execute_process(COMMAND getarch_2nd 0 >> ${TARGET_MAKE} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +#execute_process(COMMAND getarch_2nd 1 >> ${TARGET_CONF} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + +# TODO: need to read in the vars from Makefile.conf/Makefile_kernel.conf + diff --git a/cmake/system.cmake b/cmake/system.cmake new file mode 100644 index 000000000..e5c66f3ed --- /dev/null +++ b/cmake/system.cmake @@ -0,0 +1,130 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from OpenBLAS/Makefile.system +## + +set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") + +# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa +# http://stackoverflow.com/questions/714100/os-detecting-makefile + +# TODO: Makefile.system sets HOSTCC = $(CC) here if not already set -hpa + +# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. 
+if (DEFINED TARGET_CORE) + set(TARGET ${TARGET_CORE}) +endif () + +# Force fallbacks for 32bit +if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) + message(STATUS "Compiling a ${BINARY}-bit binary.") + set(NO_AVX 1) + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + set(TARGET "NEHALEM") + endif () + if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER") + set(TARGET "BARCELONA") + endif () +endif () + +if (DEFINED TARGET) + message(STATUS "Targetting the ${TARGET} architecture.") + set(GETARCH_FLAGS "-DFORCE_${TARGET}") +endif () + +if (${INTERFACE64}) + message(STATUS "Using 64-bit integers.") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") +endif () + +if (NOT DEFINED GEMM_MULTITHREAD_THRESHOLD) + set(GEMM_MULTITHREAD_THRESHOLD 4) +endif () +message(STATUS "GEMM multithread threshold set to ${GEMM_MULTITHREAD_THRESHOLD}.") +set(GETARCH_FLAGS "${GETARCH_FLAGS} -DGEMM_MULTITHREAD_THRESHOLD=${GEMM_MULTITHREAD_THRESHOLD}") + +if (${NO_AVX}) + message(STATUS "Disabling Advanced Vector Extensions (AVX).") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX") +endif () + +if (${NO_AVX2}) + message(STATUS "Disabling Advanced Vector Extensions 2 (AVX2).") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") +endif () + +if (CMAKE_BUILD_TYPE STREQUAL Debug) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -g") +endif () + +# TODO: let CMake handle this? -hpa +#if (${QUIET_MAKE}) +# set(MAKE "${MAKE} -s") +#endif() + +if (NOT DEFINED NO_PARALLEL_MAKE) + set(NO_PARALLEL_MAKE 0) +endif () +set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_PARALLEL_MAKE=${NO_PARALLEL_MAKE}") + +if (CMAKE_CXX_COMPILER STREQUAL loongcc) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") +endif () + +#if don't use Fortran, it will only compile CBLAS. +if (${ONLY_CBLAS}) + set(NO_LAPACK 1) +else () + set(ONLY_CBLAS 0) +endif () + +include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") + +if (NOT DEFINED NUM_THREADS) + # TODO: NUM_CORES comes from `getarch.c` or `cpuid_x86.c`. 
This is built and executed above in `Makefile.prebuild`, and the results are in `Makefile.conf` and `Makefile_kernel.conf`. -hpa + set(NUM_THREADS ${NUM_CORES}) +endif () + +if (NUM_THREADS EQUALS 1) + # TODO: was "override USE_THREAD = 0", do we need override here? -hpa + set(USE_THREAD 0) +endif () + +if (DEFINED USE_THREAD) + if (NOT ${USE_THREAD}) + unset(SMP) + else () + set(SMP 1) + endif () +else () + # N.B. this is NUM_THREAD in Makefile.system which is probably a bug -hpa + if (${NUM_THREADS} EQUALS 1) + unset(SMP) + else () + set(SMP 1) + endif () +endif () + +if (${SMP}) + message("SMP enabled.") +endif () + +if (NOT DEFINED NEED_PIC) + set(NEED_PIC 1) +endif () + +# TODO: I think CMake should be handling all this stuff -hpa +unset(ARFLAGS) +set(CPP "${COMPILER} -E") +set(AR "${CROSS_SUFFIX}ar") +set(AS "$(CROSS_SUFFIX)as") +set(LD "$(CROSS_SUFFIX)ld") +set(RANLIB "$(CROSS_SUFFIX)ranlib") +set(NM "$(CROSS_SUFFIX)nm") +set(DLLWRAP "$(CROSS_SUFFIX)dllwrap") +set(OBJCOPY "$(CROSS_SUFFIX)objcopy") +set(OBJCONV "$(CROSS_SUFFIX)objconv") + From 1a41022e3ef23d3d3d52e9884763e263a55fbe02 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 1 Jan 2015 21:01:28 -0600 Subject: [PATCH 002/257] Added MSVC defines to cpuid.h and getarch.c. 
--- cpuid.h | 6 +++++- getarch.c | 14 +++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cpuid.h b/cpuid.h index ab6a3fb32..406b7fa25 100644 --- a/cpuid.h +++ b/cpuid.h @@ -39,6 +39,10 @@ #ifndef CPUID_H #define CPUID_H +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#define INTEL_AMD +#endif + #define VENDOR_INTEL 1 #define VENDOR_UMC 2 #define VENDOR_AMD 3 @@ -59,7 +63,7 @@ #define FAMILY_PM 7 #define FAMILY_IA64 8 -#if defined(__i386__) || defined(__x86_64__) +#ifdef INTEL_AMD #define GET_EXFAMILY 1 #define GET_EXMODEL 2 #define GET_TYPE 3 diff --git a/getarch.c b/getarch.c index f6a5ecb94..8a6b4dcd1 100644 --- a/getarch.c +++ b/getarch.c @@ -69,10 +69,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) +#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64) #define OS_WINDOWS #endif +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#define INTEL_AMD +#endif + #include #include #ifdef OS_WINDOWS @@ -783,7 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif -#if defined(__i386__) || (__x86_64__) +#ifdef INTEL_AMD #include "cpuid_x86.c" #define OPENBLAS_SUPPORTED #endif @@ -878,7 +882,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -898,7 +902,7 @@ int main(int argc, char *argv[]){ #endif -#if defined(__i386__) || defined(__x86_64__) +#ifdef INTEL_AMD #ifndef FORCE get_sse(); #else @@ -978,7 +982,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif From 92cdac5f876b781186a530abba6fdc49c310802c Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 1 Jan 2015 21:02:48 -0600 Subject: [PATCH 003/257] Added MSVC functions to cpuid_x86.c to replace gcc-specific ASM. 
--- cpuid_x86.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index ef90b26d8..6b7e408d8 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -40,6 +40,12 @@ #include #include "cpuid.h" +#ifdef _MSC_VER +#define C_INLINE __inline +#else +#define C_INLINE inline +#endif + /* #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM @@ -53,12 +59,26 @@ #endif */ +#ifdef _MSC_VER + +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) +{ + int cpuInfo[4] = {-1}; + __cpuid(cpuInfo, op); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} + +#else + #ifndef CPUIDEMU #if defined(__APPLE__) && defined(__i386__) void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #else -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ +static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #if defined(__i386__) && defined(__PIC__) __asm__ __volatile__ ("mov %%ebx, %%edi;" @@ -115,14 +135,16 @@ void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int * #endif -static inline int have_cpuid(void){ +#endif // _MSC_VER + +static C_INLINE int have_cpuid(void){ int eax, ebx, ecx, edx; cpuid(0, &eax, &ebx, &ecx, &edx); return eax; } -static inline int have_excpuid(void){ +static C_INLINE int have_excpuid(void){ int eax, ebx, ecx, edx; cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -130,10 +152,14 @@ static inline int have_excpuid(void){ } #ifndef NO_AVX -static inline void xgetbv(int op, int * eax, int * edx){ +static C_INLINE void xgetbv(int op, int * eax, int * edx){ //Use binary code for xgetbv +#ifdef _MSC_VER + *eax = __xgetbv(op); +#else __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +#endif } #endif From 0f6bec0a32cea93deff9eb9261827a7825f56a90 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 1 Jan 2015 21:03:17 -0600 Subject: [PATCH 004/257] 
cmake.prebuild now compiles getarch. Doesn't actually run it yet. --- cmake/prebuild.cmake | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 34a683a4f..200d03692 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -29,7 +29,20 @@ include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") # compile getarch -# TODO: need to use execute_process here, or compilation won't happen until later - maybe make temporary CMakeLists.txt file using file() ? +enable_language(ASM) +set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") +file(MAKE_DIRECTORY ${GETARCH_DIR}) +try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${CMAKE_SOURCE_DIR}/getarch.c ${CMAKE_SOURCE_DIR}/cpuid.S ${CPUIDEMO} + COMPILE_DEFINITIONS ${EXFLAGS} -I${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + ) + +message(STATUS "GETARCH RESULT: ${GETARCH_RESULT}") +message(STATUS "GETARCH LOG: ${GETARCH_LOG}") + +# TODO: need to append output of getarch binary to TARGET_CONF, not sure if I can get at it after using try_compile - may need to create CMakeLists.txt on the fly and build/execute + #add_executable(getarch getarch.c cpuid.S ${CPUIDEMU} # WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # From f4d1e7a2650c985722259fbcd594559397095743 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 27 Jan 2015 11:37:39 -0600 Subject: [PATCH 005/257] Hardcoded NUM_CORES to get system.cmake working. --- cmake/prebuild.cmake | 40 ++++++++++++++++++++++++++++++++++++++++ cmake/system.cmake | 6 +++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 200d03692..ffebbe30f 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -9,6 +9,43 @@ ## Next it runs f_check and appends some fortran information to the files. ## Finally it runs getarch and getarch_2nd for even more environment information. 
+# List of vars set by this file and included files: +# OSNAME +# ARCH +# C_COMPILER +# BINARY32 +# BINARY64 +# CEXTRALIB +# F_COMPILER +# FC +# BU +# CORE +# LIBCORE +# NUM_CORES <- REQUIRED +# HAVE_MMX +# HAVE_SSE +# HAVE_SSE2 +# HAVE_SSE3 +# MAKE +# SGEMM_UNROLL_M +# SGEMM_UNROLL_N +# DGEMM_UNROLL_M +# DGEMM_UNROLL_M +# QGEMM_UNROLL_N +# QGEMM_UNROLL_N +# CGEMM_UNROLL_M +# CGEMM_UNROLL_M +# ZGEMM_UNROLL_N +# ZGEMM_UNROLL_N +# XGEMM_UNROLL_M +# XGEMM_UNROLL_N +# CGEMM3M_UNROLL_M +# CGEMM3M_UNROLL_N +# ZGEMM3M_UNROLL_M +# ZGEMM3M_UNROLL_M +# XGEMM3M_UNROLL_N +# XGEMM3M_UNROLL_N + # CPUIDEMU = ../../cpuid/table.o if (DEFINED CPUIDEMU) @@ -66,3 +103,6 @@ message(STATUS "GETARCH LOG: ${GETARCH_LOG}") # TODO: need to read in the vars from Makefile.conf/Makefile_kernel.conf +# temporarily hardcoded to get system.cmake working +set(NUM_CORES 4) + diff --git a/cmake/system.cmake b/cmake/system.cmake index e5c66f3ed..dc5aec2f2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -88,8 +88,8 @@ if (NOT DEFINED NUM_THREADS) set(NUM_THREADS ${NUM_CORES}) endif () -if (NUM_THREADS EQUALS 1) - # TODO: was "override USE_THREAD = 0", do we need override here? -hpa +if (${NUM_THREADS} EQUAL 1) + # TODO: was "override USE_THREAD = 0", do we need "override" here? -hpa set(USE_THREAD 0) endif () @@ -101,7 +101,7 @@ if (DEFINED USE_THREAD) endif () else () # N.B. this is NUM_THREAD in Makefile.system which is probably a bug -hpa - if (${NUM_THREADS} EQUALS 1) + if (${NUM_THREADS} EQUAL 1) unset(SMP) else () set(SMP 1) From d2d15e522f04344be4a1d1cae24f6ef96dedbc00 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 27 Jan 2015 12:23:35 -0600 Subject: [PATCH 006/257] Started converting lib target to CMake. The main part of this target is looping through the BLAS subfolders and calling make on them. Need to add CMakeLists.txt for each of these subfolders. 
--- CMakeLists.txt | 40 ++++++++++++++++++++++++++++++++++++++++ cmake/prebuild.cmake | 3 ++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dbb6f059..6bca1899e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,4 +15,44 @@ message(WARNING "CMake support is experimental. This will not produce the same M include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") +set(BLASDIRS interface driver/level2 driver/level3 driver/others) + +if (NOT ${DYNAMIC_ARCH}) + list(APPEND BLASDIRS kernel) +endif () + +if (DEFINED UTEST_CHECK) + set(SANITY_CHECK 1) +endif () + +if (DEFINED SANITY_CHECK) + list(APPEND BLASDIRS reference) +endif () + +set(SUBDIRS ${BLASDIRS}) +if (NOT ${NO_LAPACK}) + list(APPEND SUBDIRS lapack) +endif () + +set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench) + +# all :: libs netlib tests shared + +# libs: +if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") + message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") +endif () + +# Let CMake handle this +#if (${NOFORTRAN}) +# message(ERROR "OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. 
gfortran, ifort, openf90.") +#endif () + +if (${NO_STATIC} AND ${NO_SHARED}) + message(FATAL_ERROR "Neither static nor shared are enabled.") +endif () + +foreach (BLAS_DIR ${BLASDIRS}) + add_subdirectory(${BLAS_DIR}) +endforeach () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ffebbe30f..ded9f2ce0 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -19,7 +19,7 @@ # F_COMPILER # FC # BU -# CORE +# CORE <- REQUIRED # LIBCORE # NUM_CORES <- REQUIRED # HAVE_MMX @@ -105,4 +105,5 @@ message(STATUS "GETARCH LOG: ${GETARCH_LOG}") # temporarily hardcoded to get system.cmake working set(NUM_CORES 4) +set(CORE "GENERIC") From 864b8b31de8dad0ba45b333fe1c04e4ed667a7a5 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 27 Jan 2015 13:54:29 -0600 Subject: [PATCH 007/257] Fixed incorrect case in OS_ definition in c_check. --- cmake/c_check.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 20c0aa72f..5669c723a 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -17,11 +17,12 @@ # TODO: is ${BINARY} sufficient for the __32BIT__ define? # TODO: CMAKE_SYSTEM_PROCESSOR is not set by CMake, need to set it manually when doing a cross-compile # TODO: CMAKE_CXX_COMPILER_ID and CMAKE_SYSTEM_NAME are probably not the same strings as OpenBLAS is expecting -# TODO: detect this +# TODO: detect NEED_FU set(NEED_FU 1) +string(TOUPPER ${CMAKE_SYSTEM_NAME} CMAKE_SYSTEM_NAME_UC) file(WRITE ${TARGET_CONF} - "#define OS_${CMAKE_SYSTEM_NAME}\t1\n" + "#define OS_${CMAKE_SYSTEM_NAME_UC}\t1\n" "#define ARCH_${CMAKE_SYSTEM_PROCESSOR}\t1\n" "#define C_${CMAKE_CXX_COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" From 1e8bb0e0e02474d70708a9b555e3ca0227b5422c Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 27 Jan 2015 14:03:46 -0600 Subject: [PATCH 008/257] Fixed architecture detection when AMD64 in c_check. 
--- cmake/c_check.cmake | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 5669c723a..07ed8a178 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -20,10 +20,16 @@ # TODO: detect NEED_FU set(NEED_FU 1) -string(TOUPPER ${CMAKE_SYSTEM_NAME} CMAKE_SYSTEM_NAME_UC) +# Convert CMake vars into the format that OpenBLAS expects +string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) +set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +if (${HOST_ARCH} STREQUAL "AMD64") + set(HOST_ARCH "X86_64") +endif () + file(WRITE ${TARGET_CONF} - "#define OS_${CMAKE_SYSTEM_NAME_UC}\t1\n" - "#define ARCH_${CMAKE_SYSTEM_PROCESSOR}\t1\n" + "#define OS_${HOST_OS}\t1\n" + "#define ARCH_${HOST_ARCH}\t1\n" "#define C_${CMAKE_CXX_COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${NEED_FU}\n") From 5eefe18ae4c1c018201aaa36df2f1c889b190b0e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 27 Jan 2015 16:17:17 -0600 Subject: [PATCH 009/257] Added CMakeLists.txt for the first of the BLAS folders. It only does the double precision compile currently. I realized I didn't finish converting Makefile.system yet, so I made a note of that. 
--- .gitignore | 1 + CMakeLists.txt | 22 +++++++++++++++++++ cmake/system.cmake | 2 ++ interface/CMakeLists.txt | 46 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 interface/CMakeLists.txt diff --git a/.gitignore b/.gitignore index bae3d057f..3e163abef 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,4 @@ test/zblat1 test/zblat2 test/zblat3 build +build.* diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bca1899e..25b88d565 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,3 +56,25 @@ foreach (BLAS_DIR ${BLASDIRS}) add_subdirectory(${BLAS_DIR}) endforeach () +#Save the config files for installation +# @cp Makefile.conf Makefile.conf_last +# @cp config.h config_last.h +#ifdef QUAD_PRECISION +# @echo "#define QUAD_PRECISION">> config_last.h +#endif +#ifeq ($(EXPRECISION), 1) +# @echo "#define EXPRECISION">> config_last.h +#endif +### +#ifeq ($(DYNAMIC_ARCH), 1) +# @$(MAKE) -C kernel commonlibs || exit 1 +# @for d in $(DYNAMIC_CORE) ; \ +# do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ +# done +# @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +#endif +#ifdef USE_THREAD +# @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last +#endif +# @touch lib.grd + diff --git a/cmake/system.cmake b/cmake/system.cmake index dc5aec2f2..11f0c5cdd 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -128,3 +128,5 @@ set(DLLWRAP "$(CROSS_SUFFIX)dllwrap") set(OBJCOPY "$(CROSS_SUFFIX)objcopy") set(OBJCONV "$(CROSS_SUFFIX)objconv") +# TODO: convert rest of Makefile.system, left off at "OS dependent settings" + diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt new file mode 100644 index 000000000..62a889f49 --- /dev/null +++ b/interface/CMakeLists.txt @@ -0,0 +1,46 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + +# TODO: Need to generate object files for S, D, C, Q and X - start with D for now. 
+# The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). + +add_library(DBLAS1OBJS OBJECT + axpy.c swap.c + copy.c scal.c + dot.c + asum.c nrm2.c + max.c # amax/min/amin compiled later from same source + rot.c rotg.c rotm.c rotmg.c + axpby.c +) + +# N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. +add_library(AMAX_OBJ OBJECT max.c) +set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS USE_ABS) +add_library(AMIN_OBJ OBJECT max.c) +set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_ABS) +set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_MIN) +add_library(MIN_OBJ OBJECT max.c) +set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_MIN) + +# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +add_library(DBLAS2OBJS OBJECT + gemv.c ger.c + trsv.c trmv.c symv.c + syr.c syr2.c gbmv.c + sbmv.c spmv.c + spr.c spr2.c + tbsv.c tbmv.c + tpsv.c tpmv.c +) + +add_library(DBLAS3OBJS OBJECT + gemm.c symm.c + trsm.c syrk.c syr2k.c + omatcopy.c imatcopy.c +) + +# trmm is trsm with a compiler flag set +add_library(TRMM_OBJ OBJECT trsm.c) +set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS TRMM) + From 9a508abdc7f810df8c09e94845ef17338724fab0 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 14:52:15 -0600 Subject: [PATCH 010/257] Added first pass at driver/level2 makefile conversion. 
--- driver/level2/CMakeLists.txt | 86 ++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 driver/level2/CMakeLists.txt diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt new file mode 100644 index 000000000..8dc37a880 --- /dev/null +++ b/driver/level2/CMakeLists.txt @@ -0,0 +1,86 @@ + +# sources that need to be compiled twice, once with no flags and once with LOWER +set(UL_SOURCES + sbmv_k.c + spmv_k.c + spr_k.c + spr2_k.c + syr_k.c + syr2_k.c +) + +# sources that need to be compiled several times, for UNIT, TRANS +set(NU_SOURCES + tbmv_U.c + tbsv_U.c + tpmv_U.c + tpsv_U.c + trmv_U.c + trsv_U.c + tbmv_L.c + tbsv_L.c + tpmv_L.c + tpsv_L.c + trmv_L.c + trsv_L.c +) + +# first compile all the objects that don't need specific preprocessor defines +add_library(DBLAS_NONE OBJECT + gbmv_k.c # gbmv_N + ${UL_SOURCES} + ${NU_SOURCES} +) + +# then do objects with transpose/triangular/etc definitions + +# objects that need TRANS set +add_library(DBLAS_T OBJECT gbmv_k.c ${NU_SOURCES}) +set_target_properties(DBLAS_T PROPERTIES COMPILE_DEFINITIONS TRANS) + +# objects that need LOWER set +add_library(DBLAS_L OBJECT ${UL_SOURCES}) +set_target_properties(DBLAS_L PROPERTIES COMPILE_DEFINITIONS LOWER) + +# objects that need UNIT set +add_library(DBLAS_U OBJECT ${NU_SOURCES}) +set_target_properties(DBLAS_U PROPERTIES COMPILE_DEFINITIONS UNIT) + +# objects that need TRANS and UNIT set +add_library(DBLAS_TU OBJECT ${NU_SOURCES}) +set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS UNIT) +set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS TRANS) + +#if (DEFINED SMP) +# add_library(DBLASOBJS_SMP +# dgemv_thread_n.c dgemv_thread_t.c +# dger_thread.c +# dsymv_thread_U.c dsymv_thread_L.c +# dsyr_thread_U.c dsyr_thread_L.c +# dsyr2_thread_U.c dsyr2_thread_L.c +# dspr_thread_U.c dspr_thread_L.c +# dspr2_thread_U.c dspr2_thread_L.c +# dtrmv_thread_NUU.c dtrmv_thread_NUN.c +# dtrmv_thread_NLU.c dtrmv_thread_NLN.c +# 
dtrmv_thread_TUU.c dtrmv_thread_TUN.c +# dtrmv_thread_TLU.c dtrmv_thread_TLN.c +# dspmv_thread_U.c dspmv_thread_L.c +# dtpmv_thread_NUU.c dtpmv_thread_NUN.c +# dtpmv_thread_NLU.c dtpmv_thread_NLN.c +# dtpmv_thread_TUU.c dtpmv_thread_TUN.c +# dtpmv_thread_TLU.c dtpmv_thread_TLN.c +# dgbmv_thread_n.c dgbmv_thread_t.c +# dsbmv_thread_U.c dsbmv_thread_L.c +# dtbmv_thread_NUU.c dtbmv_thread_NUN.c +# dtbmv_thread_NLU.c dtbmv_thread_NLN.c +# dtbmv_thread_TUU.c dtbmv_thread_TUN.c +# dtbmv_thread_TLU.c dtbmv_thread_TLN.c +# ) +#endif () + +set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) + +foreach (${TARGET} ${DBLAS_TARGETS}) + set_target_properties(${TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) +endforeach () + From c5f5c7a0769c852b569d734fe03e7559039820ea Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 15:47:47 -0600 Subject: [PATCH 011/257] Updated c_check OS/compiler/bits detection. --- cmake/c_check.cmake | 37 +++++++++++++++++++++++++++---------- cmake/prebuild.cmake | 2 +- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 07ed8a178..d8facfedc 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -7,30 +7,47 @@ ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf. -# N.B. c_check is not cross-platform, so instead try to use CMake variables. Alternatively, could use try_compile to get some of this info the same way c_check does. +# N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. -# run c_check (creates the TARGET files) -# message(STATUS "Running c_check...") -# execute_process(COMMAND perl c_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_CXX_COMPILER} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) - -# TODO: is ${BINARY} sufficient for the __32BIT__ define? 
-# TODO: CMAKE_SYSTEM_PROCESSOR is not set by CMake, need to set it manually when doing a cross-compile -# TODO: CMAKE_CXX_COMPILER_ID and CMAKE_SYSTEM_NAME are probably not the same strings as OpenBLAS is expecting # TODO: detect NEED_FU set(NEED_FU 1) # Convert CMake vars into the format that OpenBLAS expects string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) +if (${HOST_OS} STREQUAL "WINDOWS") + set(HOST_OS WINNT) +endif () + +# added by hpa - check size of void ptr to detect 64-bit compile +if (NOT DEFINED BINARY) + set(BINARY 32) + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + set(BINARY 64) + endif () +endif () + +# CMake docs define these: +# CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for. +# CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR}) if (${HOST_ARCH} STREQUAL "AMD64") set(HOST_ARCH "X86_64") endif () +# If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong +if (${HOST_ARCH} STREQUAL "X86_64" AND BINARY EQUAL 32) + set(HOST_ARCH X86) +endif () + +set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +if (${COMPILER_ID} STREQUAL "GNU") + set(COMPILER_ID "GCC") +endif () + file(WRITE ${TARGET_CONF} "#define OS_${HOST_OS}\t1\n" "#define ARCH_${HOST_ARCH}\t1\n" - "#define C_${CMAKE_CXX_COMPILER_ID}\t1\n" + "#define C_${COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${NEED_FU}\n") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ded9f2ce0..a4faa131d 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -78,7 +78,7 @@ try_compile(GETARCH_RESULT ${GETARCH_DIR} message(STATUS "GETARCH RESULT: ${GETARCH_RESULT}") message(STATUS "GETARCH LOG: ${GETARCH_LOG}") -# TODO: need to append output of getarch binary to TARGET_CONF, not sure if I can get at it after using try_compile - may need to create CMakeLists.txt on the fly and build/execute +# TODO: need to append output of getarch binary to TARGET_CONF, use COPY_FILE param 
(look at try_compile docs) to copy the resulting binary somewhere then run it #add_executable(getarch getarch.c cpuid.S ${CPUIDEMU} # WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) From 1c5b6bb4f7fd843433b922ed1976d4a05f46a27c Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 16:33:48 -0600 Subject: [PATCH 012/257] Added CORE define to config.h in prebuild.cmake (temporarily). --- cmake/prebuild.cmake | 4 ++++ driver/level2/CMakeLists.txt | 2 ++ 2 files changed, 6 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a4faa131d..76f74e049 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -106,4 +106,8 @@ message(STATUS "GETARCH LOG: ${GETARCH_LOG}") # temporarily hardcoded to get system.cmake working set(NUM_CORES 4) set(CORE "GENERIC") +# TODO: this should be done by getarch! see above +file(APPEND ${TARGET_CONF} + "#define ${CORE}" +) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 8dc37a880..d06d03ccf 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,4 +1,6 @@ +include_directories(${CMAKE_SOURCE_DIR}) + # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES sbmv_k.c From 8ede4a8da49a26a89bc6d1f018a4e33103c43c31 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 17:18:26 -0600 Subject: [PATCH 013/257] getarch now compiles and sets config.h defines properly. Still isn't parsed into CMake variables, and getarch_2 needs to get the same treatment. 
--- cmake/prebuild.cmake | 39 +++++++++++++++--------------------- cmake/system.cmake | 2 +- driver/level2/CMakeLists.txt | 4 ++-- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 76f74e049..ba0e0789e 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -68,29 +68,27 @@ include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") # compile getarch enable_language(ASM) set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") +set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${CMAKE_SOURCE_DIR}/getarch.c ${CMAKE_SOURCE_DIR}/cpuid.S ${CPUIDEMO} - COMPILE_DEFINITIONS ${EXFLAGS} -I${CMAKE_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG - ) + COPY_FILE ${GETARCH_BIN} +) -message(STATUS "GETARCH RESULT: ${GETARCH_RESULT}") -message(STATUS "GETARCH LOG: ${GETARCH_LOG}") +message(STATUS "Running getarch") -# TODO: need to append output of getarch binary to TARGET_CONF, use COPY_FILE param (look at try_compile docs) to copy the resulting binary somewhere then run it +# use the cmake binary w/ the -E param to run a shell command in a cross-platform way +execute_process(COMMAND ${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) +execute_process(COMMAND ${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) -#add_executable(getarch getarch.c cpuid.S ${CPUIDEMU} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -# -## run getarch, which appends even more to the TARGET files -#message(STATUS "Running getarch") -#execute_process(COMMAND getarch 0 >> ${TARGET_MAKE} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -#execute_process(COMMAND getarch 1 >> ${TARGET_CONF} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -# -## config.h is ready for getarch_2nd now, so compile that +message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") + +# append config data from getarch even more to the 
TARGET file +file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) + +## TODO: config.h is ready for getarch_2nd now, so compile that #set(GETARCH2_SOURCES getarch_2nd.c config.h) #add_executable(getarch_2nd getarch_2nd.c config.h) # @@ -101,13 +99,8 @@ message(STATUS "GETARCH LOG: ${GETARCH_LOG}") #execute_process(COMMAND getarch_2nd 1 >> ${TARGET_CONF} # WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -# TODO: need to read in the vars from Makefile.conf/Makefile_kernel.conf - -# temporarily hardcoded to get system.cmake working +# TODO: parse the MAKE variables from getarch/getarch2 (GETARCH_MAKE_OUT) into CMAKE vars +# for now I temporarily hardcoded to get system.cmake working set(NUM_CORES 4) set(CORE "GENERIC") -# TODO: this should be done by getarch! see above -file(APPEND ${TARGET_CONF} - "#define ${CORE}" -) diff --git a/cmake/system.cmake b/cmake/system.cmake index 11f0c5cdd..0753ed028 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -109,7 +109,7 @@ else () endif () if (${SMP}) - message("SMP enabled.") + message(STATUS "SMP enabled.") endif () if (NOT DEFINED NEED_PIC) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index d06d03ccf..990337fe1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -82,7 +82,7 @@ set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS TRANS) set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) -foreach (${TARGET} ${DBLAS_TARGETS}) - set_target_properties(${TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) +foreach (${DBLAS_TARGET} ${DBLAS_TARGETS}) + set_target_properties(${DBLAS_TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) endforeach () From 61f21b5d036a221779ee7f467ca0dd856aea7f8f Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 22:20:15 -0600 Subject: [PATCH 014/257] getarch_2nd now appends its output to config.h/config_kernel.h --- cmake/prebuild.cmake | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 
deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ba0e0789e..fa6621cd1 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -54,7 +54,7 @@ endif () if (DEFINED TARGET_CORE) # set the C flags for just this file - set_source_files_properties(getarch_2nd.c PROPERTIES COMPILE_FLAGS "-DBUILD_KERNEL") + set(GETARCH2_FLAGS "-DBUILD_KERNEL") set(TARGET_MAKE "Makefile_kernel.conf") set(TARGET_CONF "config_kernel.h") else() @@ -85,19 +85,31 @@ execute_process(COMMAND ${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") -# append config data from getarch even more to the TARGET file +# append config data from getarch to the TARGET file file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) ## TODO: config.h is ready for getarch_2nd now, so compile that -#set(GETARCH2_SOURCES getarch_2nd.c config.h) -#add_executable(getarch_2nd getarch_2nd.c config.h) -# -## finally run getarch_2nd, appending yet more to the TARGET files -#message(STATUS "Running getarch_2nd") -#execute_process(COMMAND getarch_2nd 0 >> ${TARGET_MAKE} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) -#execute_process(COMMAND getarch_2nd 1 >> ${TARGET_CONF} -# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") +set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") +file(MAKE_DIRECTORY ${GETARCH2_DIR}) +try_compile(GETARCH2_RESULT ${GETARCH2_DIR} + SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${GETARCH2_BIN} +) + +message(STATUS "getarch2 result ${GETARCH2_RESULT}") +message(STATUS "getarch2 log ${GETARCH2_LOG}") +# use the cmake binary w/ the -E param to run a shell command in a cross-platform way +execute_process(COMMAND ${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) +execute_process(COMMAND ${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) + 
+message(STATUS "GETARCH_2 results:\n${GETARCH2_MAKE_OUT}") +message(STATUS "GETARCH_2 cresults:\n${GETARCH2_CONF_OUT}") + +# append config data from getarch_2nd to the TARGET file +file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) # TODO: parse the MAKE variables from getarch/getarch2 (GETARCH_MAKE_OUT) into CMAKE vars # for now I temporarily hardcoded to get system.cmake working From 8c23965da381ef878a1f5fd81506b63df3083037 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 28 Jan 2015 22:57:44 -0600 Subject: [PATCH 015/257] prebuild.cmake now reads the output from getarch into CMake vars. --- cmake/prebuild.cmake | 30 ++++++++++++++++++++---------- driver/level2/CMakeLists.txt | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index fa6621cd1..99ff0430a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -83,12 +83,21 @@ message(STATUS "Running getarch") execute_process(COMMAND ${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) execute_process(COMMAND ${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) -message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") +#message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") # append config data from getarch to the TARGET file file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) -## TODO: config.h is ready for getarch_2nd now, so compile that +# TODO: make this a function, the exact same code is used again with getarch2 +string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_MAKE_OUT}") +foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) + # split the line into var and value, then assign the value to a CMake var + string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") + list(GET SPLIT_VAR 0 VAR_NAME) + list(GET SPLIT_VAR 1 VAR_VALUE) + set(${VAR_NAME} ${VAR_VALUE}) +endforeach () + set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY 
${GETARCH2_DIR}) @@ -99,20 +108,21 @@ try_compile(GETARCH2_RESULT ${GETARCH2_DIR} COPY_FILE ${GETARCH2_BIN} ) -message(STATUS "getarch2 result ${GETARCH2_RESULT}") -message(STATUS "getarch2 log ${GETARCH2_LOG}") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way execute_process(COMMAND ${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) execute_process(COMMAND ${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) -message(STATUS "GETARCH_2 results:\n${GETARCH2_MAKE_OUT}") -message(STATUS "GETARCH_2 cresults:\n${GETARCH2_CONF_OUT}") +#message(STATUS "GETARCH_2 results:\n${GETARCH2_MAKE_OUT}") # append config data from getarch_2nd to the TARGET file file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) -# TODO: parse the MAKE variables from getarch/getarch2 (GETARCH_MAKE_OUT) into CMAKE vars -# for now I temporarily hardcoded to get system.cmake working -set(NUM_CORES 4) -set(CORE "GENERIC") +string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH2_MAKE_OUT}") +foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) + # split the line into var and value, then assign the value to a CMake var + string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") + list(GET SPLIT_VAR 0 VAR_NAME) + list(GET SPLIT_VAR 1 VAR_VALUE) + set(${VAR_NAME} ${VAR_VALUE}) +endforeach () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 990337fe1..c2119bfe1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -82,7 +82,7 @@ set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS TRANS) set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) -foreach (${DBLAS_TARGET} ${DBLAS_TARGETS}) +foreach (DBLAS_TARGET ${DBLAS_TARGETS}) set_target_properties(${DBLAS_TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) endforeach () From dabaecb2bc7d536607abe2d9930636934c826150 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 29 Jan 2015 09:30:47 -0600 Subject: [PATCH 016/257] Moved 
getarch parsing code into a function. --- cmake/prebuild.cmake | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 99ff0430a..ad1a83912 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -65,6 +65,19 @@ endif () include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") +# Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE +function(ParseGetArchVars GETARCH_IN) + string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}") + foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) + # split the line into var and value, then assign the value to a CMake var + string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") + list(GET SPLIT_VAR 0 VAR_NAME) + list(GET SPLIT_VAR 1 VAR_VALUE) + message(STATUS "Setting ${VAR_NAME} to ${VAR_VALUE}") + set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) + endforeach () +endfunction () + # compile getarch enable_language(ASM) set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") @@ -85,18 +98,9 @@ execute_process(COMMAND ${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) #message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") -# append config data from getarch to the TARGET file +# append config data from getarch to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) - -# TODO: make this a function, the exact same code is used again with getarch2 -string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_MAKE_OUT}") -foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) - # split the line into var and value, then assign the value to a CMake var - string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") - list(GET SPLIT_VAR 0 VAR_NAME) - list(GET SPLIT_VAR 1 VAR_VALUE) - set(${VAR_NAME} ${VAR_VALUE}) -endforeach () +ParseGetArchVars(${GETARCH_MAKE_OUT}) 
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") @@ -112,17 +116,7 @@ try_compile(GETARCH2_RESULT ${GETARCH2_DIR} execute_process(COMMAND ${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) execute_process(COMMAND ${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) -#message(STATUS "GETARCH_2 results:\n${GETARCH2_MAKE_OUT}") - -# append config data from getarch_2nd to the TARGET file +# append config data from getarch_2nd to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) - -string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH2_MAKE_OUT}") -foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) - # split the line into var and value, then assign the value to a CMake var - string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") - list(GET SPLIT_VAR 0 VAR_NAME) - list(GET SPLIT_VAR 1 VAR_VALUE) - set(${VAR_NAME} ${VAR_VALUE}) -endforeach () +ParseGetArchVars(${GETARCH2_MAKE_OUT}) From dbdca7bf0c9a8ae202653a1dab028abeabeab275 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 29 Jan 2015 22:53:11 -0600 Subject: [PATCH 017/257] Added first pass at driver/level3 Makefile conversion. Added a rather convoluted CMake function to find all combinations of a given list. This will be useful for the object files that are compiled multiple times with different combinations of preprocessor definitions. 
--- cmake/prebuild.cmake | 1 - driver/level3/CMakeLists.txt | 106 +++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 driver/level3/CMakeLists.txt diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ad1a83912..60566e3f2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -73,7 +73,6 @@ function(ParseGetArchVars GETARCH_IN) string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") list(GET SPLIT_VAR 0 VAR_NAME) list(GET SPLIT_VAR 1 VAR_VALUE) - message(STATUS "Setting ${VAR_NAME} to ${VAR_VALUE}") set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) endforeach () endfunction () diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt new file mode 100644 index 000000000..df6445de4 --- /dev/null +++ b/driver/level3/CMakeLists.txt @@ -0,0 +1,106 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +set(USE_GEMM3M 0) + +if (DEFINED ARCH) + if (${ARCH} STREQUAL "x86") + set(USE_GEMM3M 1) + endif () + + if (${ARCH} STREQUAL "x86_64") + set(USE_GEMM3M 1) + endif () + + if (${ARCH} STREQUAL "ia64") + set(USE_GEMM3M 1) + endif () + + if (${ARCH} STREQUAL "MIPS") + set(USE_GEMM3M 1) + endif () +endif () + +# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa + +# loop through gemm.c defines +set(GEMM_DEFINES NN NT TN TT) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + add_library(GEMM_${GEMM_DEFINE}_OBJS OBJECT gemm.c) + set_target_properties(GEMM_${GEMM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${GEMM_DEFINE}) +endforeach () + +# Returns all combinations of the input list, as a list with colon-separated combinations +# E.g. input of A B C returns A B C A:B A:C B:C +# N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). 
+function(AllCombinations list_in) + list(LENGTH list_in list_count) + set(num_combos 1) + math(EXPR num_combos "${num_combos} << ${list_count}") + set(LIST_OUT "") + foreach (c RANGE ${num_combos}) + set(current_combo "") + # this is a little ridiculous just to iterate through a list w/ indices + math(EXPR last_list_index "${list_count} - 1") + foreach (list_index RANGE 0 ${last_list_index}) + math(EXPR bit "1 << ${list_index}") + math(EXPR combo_has_bit "${c} & ${bit}") + list(GET list_in ${list_index} list_elem) + if (combo_has_bit) + if (current_combo) + set(current_combo "${current_combo}:${list_elem}") + else () + set(current_combo ${list_elem}) + endif () + endif () + endforeach () + list(APPEND LIST_OUT ${current_combo}) + endforeach () + set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) +endfunction () + +# these sources are compiled with combinations of TRANS, UPPER, and UNIT, for 32 combinations total +set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) +AllCombinations("TRANS UPPER UNIT") +set(TRM_DEFINE_COMBOS LIST_OUT) +message(STATUS "alcombos result: ${LIST_OUT}") +foreach (TRM_SOURCE ${TRM_SOURCES}) + foreach (TRM_DEFINES ${TRM_DEFINE_COMBOS}) + string(REGEX MATCH "[a-z]+_[LR]" TRM_NAME ${TRM_SOURCE}) + string(TOUPPER ${TRM_NAME} TRM_NAME) + # TODO: TRM_DEFINES is a colon-separated list of defines to set for this object - need to parse it and set them using set_target_properties, and also come up with a unique id for the lib name (e.g. 
first letter of each define, so TRANS UPPER UNIT is TUU) + #add_library(${TRM_NAME}_${TRM_DEFINE}_OBJS OBJECT ${TRM_SOURCE}) + #set_target_properties(${TRM_NAME}_${TRM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${TRM_DEFINE}) + endforeach () +endforeach () + +# dsymm_LU.c dsymm_LL.c dsymm_RU.c dsymm_RL.c +# dsyrk_UN.c dsyrk_UT.c dsyrk_LN.c dsyrk_LT.c +# dsyr2k_UN.c dsyr2k_UT.c dsyr2k_LN.c dsyr2k_LT.c +# dsyrk_kernel_U.c dsyrk_kernel_L.c +# dsyr2k_kernel_U.c dsyr2k_kernel_L.c + +#if (SMP) +# +# COMMONOBJS += gemm_thread_m.c gemm_thread_n.c gemm_thread_mn.c gemm_thread_variable.c +# COMMONOBJS += syrk_thread.c +# +# if (USE_SIMPLE_THREADED_LEVEL3) +# DBLASOBJS += dgemm_thread_nn.c dgemm_thread_nt.c dgemm_thread_tn.c dgemm_thread_tt.c +# DBLASOBJS += dsymm_thread_LU.c dsymm_thread_LL.c dsymm_thread_RU.c dsymm_thread_RL.c +# DBLASOBJS += dsyrk_thread_UN.c dsyrk_thread_UT.c dsyrk_thread_LN.c dsyrk_thread_LT.c +# +# endif () +#endif () +# +#HPLOBJS = +# dgemm_nn.c dgemm_nt.c dgemm_tn.c dgemm_tt.c +# dtrsm_LNUU.c dtrsm_LNUN.c dtrsm_LNLU.c dtrsm_LNLN.c +# dtrsm_LTUU.c dtrsm_LTUN.c dtrsm_LTLU.c dtrsm_LTLN.c +# dtrsm_RNUU.c dtrsm_RNUN.c dtrsm_RNLU.c dtrsm_RNLN.c +# dtrsm_RTUU.c dtrsm_RTUN.c dtrsm_RTLU.c dtrsm_RTLN.c +# +#if (USE_SIMPLE_THREADED_LEVEL3) +# HPLOBJS += dgemm_thread_nn.c dgemm_thread_nt.c +# dgemm_thread_tn.c dgemm_thread_tt.c +#endif +# From a6cf8aafc0ad973a0599c939ddcbc138f99a4669 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 11:21:50 -0600 Subject: [PATCH 018/257] Updated level3/CMakeLists with correct defines using all combos. 
--- driver/level2/CMakeLists.txt | 175 +++++++++++++++++------------------ driver/level3/CMakeLists.txt | 35 +++++-- interface/CMakeLists.txt | 91 +++++++++--------- 3 files changed, 157 insertions(+), 144 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index c2119bfe1..ff6faab90 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,88 +1,87 @@ - -include_directories(${CMAKE_SOURCE_DIR}) - -# sources that need to be compiled twice, once with no flags and once with LOWER -set(UL_SOURCES - sbmv_k.c - spmv_k.c - spr_k.c - spr2_k.c - syr_k.c - syr2_k.c -) - -# sources that need to be compiled several times, for UNIT, TRANS -set(NU_SOURCES - tbmv_U.c - tbsv_U.c - tpmv_U.c - tpsv_U.c - trmv_U.c - trsv_U.c - tbmv_L.c - tbsv_L.c - tpmv_L.c - tpsv_L.c - trmv_L.c - trsv_L.c -) - -# first compile all the objects that don't need specific preprocessor defines -add_library(DBLAS_NONE OBJECT - gbmv_k.c # gbmv_N - ${UL_SOURCES} - ${NU_SOURCES} -) - -# then do objects with transpose/triangular/etc definitions - -# objects that need TRANS set -add_library(DBLAS_T OBJECT gbmv_k.c ${NU_SOURCES}) -set_target_properties(DBLAS_T PROPERTIES COMPILE_DEFINITIONS TRANS) - -# objects that need LOWER set -add_library(DBLAS_L OBJECT ${UL_SOURCES}) -set_target_properties(DBLAS_L PROPERTIES COMPILE_DEFINITIONS LOWER) - -# objects that need UNIT set -add_library(DBLAS_U OBJECT ${NU_SOURCES}) -set_target_properties(DBLAS_U PROPERTIES COMPILE_DEFINITIONS UNIT) - -# objects that need TRANS and UNIT set -add_library(DBLAS_TU OBJECT ${NU_SOURCES}) -set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS UNIT) -set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS TRANS) - -#if (DEFINED SMP) -# add_library(DBLASOBJS_SMP -# dgemv_thread_n.c dgemv_thread_t.c -# dger_thread.c -# dsymv_thread_U.c dsymv_thread_L.c -# dsyr_thread_U.c dsyr_thread_L.c -# dsyr2_thread_U.c dsyr2_thread_L.c -# dspr_thread_U.c dspr_thread_L.c -# 
dspr2_thread_U.c dspr2_thread_L.c -# dtrmv_thread_NUU.c dtrmv_thread_NUN.c -# dtrmv_thread_NLU.c dtrmv_thread_NLN.c -# dtrmv_thread_TUU.c dtrmv_thread_TUN.c -# dtrmv_thread_TLU.c dtrmv_thread_TLN.c -# dspmv_thread_U.c dspmv_thread_L.c -# dtpmv_thread_NUU.c dtpmv_thread_NUN.c -# dtpmv_thread_NLU.c dtpmv_thread_NLN.c -# dtpmv_thread_TUU.c dtpmv_thread_TUN.c -# dtpmv_thread_TLU.c dtpmv_thread_TLN.c -# dgbmv_thread_n.c dgbmv_thread_t.c -# dsbmv_thread_U.c dsbmv_thread_L.c -# dtbmv_thread_NUU.c dtbmv_thread_NUN.c -# dtbmv_thread_NLU.c dtbmv_thread_NLN.c -# dtbmv_thread_TUU.c dtbmv_thread_TUN.c -# dtbmv_thread_TLU.c dtbmv_thread_TLN.c -# ) -#endif () - -set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) - -foreach (DBLAS_TARGET ${DBLAS_TARGETS}) - set_target_properties(${DBLAS_TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) -endforeach () - + +include_directories(${CMAKE_SOURCE_DIR}) + +# sources that need to be compiled twice, once with no flags and once with LOWER +set(UL_SOURCES + sbmv_k.c + spmv_k.c + spr_k.c + spr2_k.c + syr_k.c + syr2_k.c +) + +# sources that need to be compiled several times, for UNIT, TRANS +set(NU_SOURCES + tbmv_U.c + tbsv_U.c + tpmv_U.c + tpsv_U.c + trmv_U.c + trsv_U.c + tbmv_L.c + tbsv_L.c + tpmv_L.c + tpsv_L.c + trmv_L.c + trsv_L.c +) + +# first compile all the objects that don't need specific preprocessor defines +add_library(DBLAS_NONE OBJECT + gbmv_k.c # gbmv_N + ${UL_SOURCES} + ${NU_SOURCES} +) + +# then do objects with transpose/triangular/etc definitions + +# objects that need TRANS set +add_library(DBLAS_T OBJECT gbmv_k.c ${NU_SOURCES}) +set_target_properties(DBLAS_T PROPERTIES COMPILE_DEFINITIONS "TRANS") + +# objects that need LOWER set +add_library(DBLAS_L OBJECT ${UL_SOURCES}) +set_target_properties(DBLAS_L PROPERTIES COMPILE_DEFINITIONS "LOWER") + +# objects that need UNIT set +add_library(DBLAS_U OBJECT ${NU_SOURCES}) +set_target_properties(DBLAS_U PROPERTIES COMPILE_DEFINITIONS "UNIT") + +# objects that need TRANS and 
UNIT set +add_library(DBLAS_TU OBJECT ${NU_SOURCES}) +set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS "UNIT;TRANS") + +#if (DEFINED SMP) +# add_library(DBLASOBJS_SMP +# dgemv_thread_n.c dgemv_thread_t.c +# dger_thread.c +# dsymv_thread_U.c dsymv_thread_L.c +# dsyr_thread_U.c dsyr_thread_L.c +# dsyr2_thread_U.c dsyr2_thread_L.c +# dspr_thread_U.c dspr_thread_L.c +# dspr2_thread_U.c dspr2_thread_L.c +# dtrmv_thread_NUU.c dtrmv_thread_NUN.c +# dtrmv_thread_NLU.c dtrmv_thread_NLN.c +# dtrmv_thread_TUU.c dtrmv_thread_TUN.c +# dtrmv_thread_TLU.c dtrmv_thread_TLN.c +# dspmv_thread_U.c dspmv_thread_L.c +# dtpmv_thread_NUU.c dtpmv_thread_NUN.c +# dtpmv_thread_NLU.c dtpmv_thread_NLN.c +# dtpmv_thread_TUU.c dtpmv_thread_TUN.c +# dtpmv_thread_TLU.c dtpmv_thread_TLN.c +# dgbmv_thread_n.c dgbmv_thread_t.c +# dsbmv_thread_U.c dsbmv_thread_L.c +# dtbmv_thread_NUU.c dtbmv_thread_NUN.c +# dtbmv_thread_NLU.c dtbmv_thread_NLN.c +# dtbmv_thread_TUU.c dtbmv_thread_TUN.c +# dtbmv_thread_TLU.c dtbmv_thread_TLN.c +# ) +#endif () + +set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) + +foreach (DBLAS_TARGET ${DBLAS_TARGETS}) + set_target_properties(${DBLAS_TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) +endforeach () + diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index df6445de4..3a282a0ae 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -60,16 +60,31 @@ endfunction () # these sources are compiled with combinations of TRANS, UPPER, and UNIT, for 32 combinations total set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) -AllCombinations("TRANS UPPER UNIT") -set(TRM_DEFINE_COMBOS LIST_OUT) -message(STATUS "alcombos result: ${LIST_OUT}") -foreach (TRM_SOURCE ${TRM_SOURCES}) - foreach (TRM_DEFINES ${TRM_DEFINE_COMBOS}) - string(REGEX MATCH "[a-z]+_[LR]" TRM_NAME ${TRM_SOURCE}) - string(TOUPPER ${TRM_NAME} TRM_NAME) - # TODO: TRM_DEFINES is a colon-separated list of defines to set for this object - need to parse 
it and set them using set_target_properties, and also come up with a unique id for the lib name (e.g. first letter of each define, so TRANS UPPER UNIT is TUU) - #add_library(${TRM_NAME}_${TRM_DEFINE}_OBJS OBJECT ${TRM_SOURCE}) - #set_target_properties(${TRM_NAME}_${TRM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${TRM_DEFINE}) +AllCombinations("TRANS;UPPER;UNIT") +set(TRM_DEFINE_COMBOS ${LIST_OUT}) +foreach (trm_source ${TRM_SOURCES}) + foreach (trm_defines ${TRM_DEFINE_COMBOS}) + + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" trm_defines ${trm_defines}) + + # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) + set(trm_obj_name "") + foreach (trm_define ${trm_defines}) + string(REGEX MATCH "^[A-Z][A-Z]" letter ${trm_define}) + set(trm_obj_name "${trm_obj_name}${letter}") + endforeach () + + # parse file name + string(REGEX MATCH "[a-z]+_[LR]" trm_name ${trm_source}) + string(TOUPPER ${trm_name} trm_name) + + # prepend the uppercased file name to the obj name + set(trm_obj_name "${trm_name}_${trm_obj_name}_OBJS") + + # now add the object and set the defines + add_library(${trm_obj_name} OBJECT ${trm_source}) + set_target_properties(${trm_obj_name} PROPERTIES COMPILE_DEFINITIONS "${trm_defines}") endforeach () endforeach () diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 62a889f49..c38a73f84 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,46 +1,45 @@ - -include_directories(${CMAKE_SOURCE_DIR}) - -# TODO: Need to generate object files for S, D, C, Q and X - start with D for now. -# The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). 
- -add_library(DBLAS1OBJS OBJECT - axpy.c swap.c - copy.c scal.c - dot.c - asum.c nrm2.c - max.c # amax/min/amin compiled later from same source - rot.c rotg.c rotm.c rotmg.c - axpby.c -) - -# N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. -add_library(AMAX_OBJ OBJECT max.c) -set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS USE_ABS) -add_library(AMIN_OBJ OBJECT max.c) -set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_ABS) -set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_MIN) -add_library(MIN_OBJ OBJECT max.c) -set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS USE_MIN) - -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f -add_library(DBLAS2OBJS OBJECT - gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c - tbsv.c tbmv.c - tpsv.c tpmv.c -) - -add_library(DBLAS3OBJS OBJECT - gemm.c symm.c - trsm.c syrk.c syr2k.c - omatcopy.c imatcopy.c -) - -# trmm is trsm with a compiler flag set -add_library(TRMM_OBJ OBJECT trsm.c) -set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS TRMM) - + +include_directories(${CMAKE_SOURCE_DIR}) + +# TODO: Need to generate object files for S, D, C, Q and X - start with D for now. +# The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). + +add_library(DBLAS1OBJS OBJECT + axpy.c swap.c + copy.c scal.c + dot.c + asum.c nrm2.c + max.c # amax/min/amin compiled later from same source + rot.c rotg.c rotm.c rotmg.c + axpby.c +) + +# N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. 
+add_library(AMAX_OBJ OBJECT max.c) +set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS") +add_library(AMIN_OBJ OBJECT max.c) +set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS;USE_MIN") +add_library(MIN_OBJ OBJECT max.c) +set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") + +# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +add_library(DBLAS2OBJS OBJECT + gemv.c ger.c + trsv.c trmv.c symv.c + syr.c syr2.c gbmv.c + sbmv.c spmv.c + spr.c spr2.c + tbsv.c tbmv.c + tpsv.c tpmv.c +) + +add_library(DBLAS3OBJS OBJECT + gemm.c symm.c + trsm.c syrk.c syr2k.c + omatcopy.c imatcopy.c +) + +# trmm is trsm with a compiler flag set +add_library(TRMM_OBJ OBJECT trsm.c) +set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS TRMM) + From 8d9b196e0dd3f1230b3f6e610e6a54ead64b514f Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 12:14:44 -0600 Subject: [PATCH 019/257] Moved loop over define combos into a function. This function takes a set of sources and a set of preprocessor definitions. It will iterate over the sources and build an object file for each combination of preprocessor definitions for each source file. 
--- driver/level3/CMakeLists.txt | 60 ++++++++++++++++++++---------------- driver/others/CMakeLists.txt | 2 ++ 2 files changed, 35 insertions(+), 27 deletions(-) create mode 100644 driver/others/CMakeLists.txt diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 3a282a0ae..9059a46d0 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -58,35 +58,41 @@ function(AllCombinations list_in) set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) endfunction () -# these sources are compiled with combinations of TRANS, UPPER, and UNIT, for 32 combinations total -set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) -AllCombinations("TRANS;UPPER;UNIT") -set(TRM_DEFINE_COMBOS ${LIST_OUT}) -foreach (trm_source ${TRM_SOURCES}) - foreach (trm_defines ${TRM_DEFINE_COMBOS}) - - # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with - string(REPLACE ":" ";" trm_defines ${trm_defines}) - - # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) - set(trm_obj_name "") - foreach (trm_define ${trm_defines}) - string(REGEX MATCH "^[A-Z][A-Z]" letter ${trm_define}) - set(trm_obj_name "${trm_obj_name}${letter}") +# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in +function(GenerateObjects sources_in defines_in) + AllCombinations("${defines_in}") + set(define_combos ${LIST_OUT}) + foreach (source_file ${sources_in}) + foreach (def_combo ${define_combos}) + + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" def_combo ${def_combo}) + + # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) + set(obj_name "") + foreach (combo_elem ${def_combo}) + string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) + set(obj_name 
"${obj_name}${letter}") + endforeach () + + # parse file name + string(REGEX MATCH "[a-z]+_[LR]" source_name ${source_file}) + string(TOUPPER ${source_name} source_name) + + # prepend the uppercased file name to the obj name + set(obj_name "${source_name}_${obj_name}_OBJS") + + # now add the object and set the defines + add_library(${obj_name} OBJECT ${source_file}) + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${def_combo}") endforeach () - - # parse file name - string(REGEX MATCH "[a-z]+_[LR]" trm_name ${trm_source}) - string(TOUPPER ${trm_name} trm_name) - - # prepend the uppercased file name to the obj name - set(trm_obj_name "${trm_name}_${trm_obj_name}_OBJS") - - # now add the object and set the defines - add_library(${trm_obj_name} OBJECT ${trm_source}) - set_target_properties(${trm_obj_name} PROPERTIES COMPILE_DEFINITIONS "${trm_defines}") endforeach () -endforeach () +endfunction () + +# these sources are compiled with combinations of TRANS, UPPER, and UNIT, for 32 combinations total +set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) +set(TRM_DEFINES TRANS UPPER UNIT) +GenerateObjects("${TRM_SOURCES}" "${TRM_DEFINES}") # dsymm_LU.c dsymm_LL.c dsymm_RU.c dsymm_RL.c # dsyrk_UN.c dsyrk_UT.c dsyrk_LN.c dsyrk_LT.c diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt new file mode 100644 index 000000000..2685d79c8 --- /dev/null +++ b/driver/others/CMakeLists.txt @@ -0,0 +1,2 @@ + +# NYI From 7693887d61cfe495ee37a8ed8dbb4eec54d0b3e9 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 13:01:11 -0600 Subject: [PATCH 020/257] Added empty set to the combinations generated by AllCombinations. 
--- driver/level3/CMakeLists.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 9059a46d0..37a9b1bd5 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -35,9 +35,10 @@ endforeach () function(AllCombinations list_in) list(LENGTH list_in list_count) set(num_combos 1) - math(EXPR num_combos "${num_combos} << ${list_count}") + # subtract 1 since we will iterate from 0 to num_combos + math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") set(LIST_OUT "") - foreach (c RANGE ${num_combos}) + foreach (c RANGE 0 ${num_combos}) set(current_combo "") # this is a little ridiculous just to iterate through a list w/ indices math(EXPR last_list_index "${list_count} - 1") @@ -55,6 +56,7 @@ function(AllCombinations list_in) endforeach () list(APPEND LIST_OUT ${current_combo}) endforeach () + list(APPEND LIST_OUT " ") # Empty set is a valic combination, but CMake isn't appending the empty string for some reason, use a space set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) endfunction () @@ -76,7 +78,7 @@ function(GenerateObjects sources_in defines_in) endforeach () # parse file name - string(REGEX MATCH "[a-z]+_[LR]" source_name ${source_file}) + string(REGEX MATCH "^[a-zA-Z_]+" source_name ${source_file}) string(TOUPPER ${source_name} source_name) # prepend the uppercased file name to the obj name @@ -84,7 +86,9 @@ function(GenerateObjects sources_in defines_in) # now add the object and set the defines add_library(${obj_name} OBJECT ${source_file}) - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${def_combo}") + if (NOT "${def_combo}" STREQUAL " ") # using space as the empty set + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${def_combo}") + endif () endforeach () endforeach () endfunction () @@ -94,6 +98,9 @@ set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) set(TRM_DEFINES TRANS UPPER UNIT) 
GenerateObjects("${TRM_SOURCES}" "${TRM_DEFINES}") +# TODO: also need to set NN for all these objs (add param to GenerateObjects for defines that apply to all +GenerateObjects("symm_k.c" "LOWER;RSIDE") + # dsymm_LU.c dsymm_LL.c dsymm_RU.c dsymm_RL.c # dsyrk_UN.c dsyrk_UT.c dsyrk_LN.c dsyrk_LT.c # dsyr2k_UN.c dsyr2k_UT.c dsyr2k_LN.c dsyr2k_LT.c From e5e7595bf913d249c5dcd1e10cea0d472386ac49 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 13:31:13 -0600 Subject: [PATCH 021/257] Added paramater to GenerateObjects for defines that affect all sources. --- driver/level3/CMakeLists.txt | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 37a9b1bd5..ef3695e2d 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -61,7 +61,10 @@ function(AllCombinations list_in) endfunction () # generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in -function(GenerateObjects sources_in defines_in) +# @param sources_in the source files to build from +# @param defines_in the preprocessor definitions that will be combined to create the object files +# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects +function(GenerateObjects sources_in defines_in all_defines_in) AllCombinations("${defines_in}") set(define_combos ${LIST_OUT}) foreach (source_file ${sources_in}) @@ -78,7 +81,7 @@ function(GenerateObjects sources_in defines_in) endforeach () # parse file name - string(REGEX MATCH "^[a-zA-Z_]+" source_name ${source_file}) + string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) string(TOUPPER ${source_name} source_name) # prepend the uppercased file name to the obj name @@ -86,26 +89,23 @@ function(GenerateObjects sources_in defines_in) # now add the object and set the defines add_library(${obj_name} OBJECT ${source_file}) - 
if (NOT "${def_combo}" STREQUAL " ") # using space as the empty set - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${def_combo}") + set(cur_defines ${def_combo}) + if ("${cur_defines}" STREQUAL " ") + set(cur_defines ${all_defines_in}) + else () + list(APPEND cur_defines ${all_defines_in}) + endif () + if (cur_defines AND NOT "${cur_defines}" STREQUAL " ") # using space as the empty set + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${cur_defines}") endif () endforeach () endforeach () endfunction () -# these sources are compiled with combinations of TRANS, UPPER, and UNIT, for 32 combinations total -set(TRM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) -set(TRM_DEFINES TRANS UPPER UNIT) -GenerateObjects("${TRM_SOURCES}" "${TRM_DEFINES}") - -# TODO: also need to set NN for all these objs (add param to GenerateObjects for defines that apply to all -GenerateObjects("symm_k.c" "LOWER;RSIDE") - -# dsymm_LU.c dsymm_LL.c dsymm_RU.c dsymm_RL.c -# dsyrk_UN.c dsyrk_UT.c dsyrk_LN.c dsyrk_LT.c -# dsyr2k_UN.c dsyr2k_UT.c dsyr2k_LN.c dsyr2k_LT.c -# dsyrk_kernel_U.c dsyrk_kernel_L.c -# dsyr2k_kernel_U.c dsyr2k_kernel_L.c +GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE") +GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN;DOUBLE") +GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE") +GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE") #if (SMP) # From d3dcdddf7569eac76e580be123c7a59a74f6d81a Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 13:47:40 -0600 Subject: [PATCH 022/257] Moved functions into util cmake file. 
--- CMakeLists.txt | 1 + cmake/prebuild.cmake | 12 ----- cmake/utils.cmake | 87 ++++++++++++++++++++++++++++++++++++ driver/level3/CMakeLists.txt | 73 ------------------------------ 4 files changed, 88 insertions(+), 85 deletions(-) create mode 100644 cmake/utils.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 25b88d565..be52d9713 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ project(OpenBLAS) message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with.") +include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") set(BLASDIRS interface driver/level2 driver/level3 driver/others) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 60566e3f2..9595dab0d 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -65,18 +65,6 @@ endif () include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") -# Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE -function(ParseGetArchVars GETARCH_IN) - string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}") - foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) - # split the line into var and value, then assign the value to a CMake var - string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") - list(GET SPLIT_VAR 0 VAR_NAME) - list(GET SPLIT_VAR 1 VAR_VALUE) - set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) - endforeach () -endfunction () - # compile getarch enable_language(ASM) set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 000000000..a95695553 --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,87 @@ +# Functions to help with the OpenBLAS build + +# Reads string from getarch into CMake vars. 
Format of getarch vars is VARNAME=VALUE +function(ParseGetArchVars GETARCH_IN) + string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}") + foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) + # split the line into var and value, then assign the value to a CMake var + string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") + list(GET SPLIT_VAR 0 VAR_NAME) + list(GET SPLIT_VAR 1 VAR_VALUE) + set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) + endforeach () +endfunction () + +# Returns all combinations of the input list, as a list with colon-separated combinations +# E.g. input of A B C returns A B C A:B A:C B:C +# N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). +function(AllCombinations list_in) + list(LENGTH list_in list_count) + set(num_combos 1) + # subtract 1 since we will iterate from 0 to num_combos + math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") + set(LIST_OUT "") + foreach (c RANGE 0 ${num_combos}) + set(current_combo "") + # this is a little ridiculous just to iterate through a list w/ indices + math(EXPR last_list_index "${list_count} - 1") + foreach (list_index RANGE 0 ${last_list_index}) + math(EXPR bit "1 << ${list_index}") + math(EXPR combo_has_bit "${c} & ${bit}") + list(GET list_in ${list_index} list_elem) + if (combo_has_bit) + if (current_combo) + set(current_combo "${current_combo}:${list_elem}") + else () + set(current_combo ${list_elem}) + endif () + endif () + endforeach () + list(APPEND LIST_OUT ${current_combo}) + endforeach () + list(APPEND LIST_OUT " ") # Empty set is a valic combination, but CMake isn't appending the empty string for some reason, use a space + set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) +endfunction () + +# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in +# @param sources_in the source files to build from +# @param defines_in 
the preprocessor definitions that will be combined to create the object files +# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects +function(GenerateObjects sources_in defines_in all_defines_in) + AllCombinations("${defines_in}") + set(define_combos ${LIST_OUT}) + foreach (source_file ${sources_in}) + foreach (def_combo ${define_combos}) + + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" def_combo ${def_combo}) + + # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) + set(obj_name "") + foreach (combo_elem ${def_combo}) + string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) + set(obj_name "${obj_name}${letter}") + endforeach () + + # parse file name + string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) + string(TOUPPER ${source_name} source_name) + + # prepend the uppercased file name to the obj name + set(obj_name "${source_name}_${obj_name}_OBJS") + + # now add the object and set the defines + add_library(${obj_name} OBJECT ${source_file}) + set(cur_defines ${def_combo}) + if ("${cur_defines}" STREQUAL " ") + set(cur_defines ${all_defines_in}) + else () + list(APPEND cur_defines ${all_defines_in}) + endif () + if (cur_defines AND NOT "${cur_defines}" STREQUAL " ") # using space as the empty set + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${cur_defines}") + endif () + endforeach () + endforeach () +endfunction () + diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index ef3695e2d..2b5c18007 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -29,79 +29,6 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) set_target_properties(GEMM_${GEMM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${GEMM_DEFINE}) endforeach () -# Returns all combinations of the input list, as a list with colon-separated 
combinations -# E.g. input of A B C returns A B C A:B A:C B:C -# N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). -function(AllCombinations list_in) - list(LENGTH list_in list_count) - set(num_combos 1) - # subtract 1 since we will iterate from 0 to num_combos - math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") - set(LIST_OUT "") - foreach (c RANGE 0 ${num_combos}) - set(current_combo "") - # this is a little ridiculous just to iterate through a list w/ indices - math(EXPR last_list_index "${list_count} - 1") - foreach (list_index RANGE 0 ${last_list_index}) - math(EXPR bit "1 << ${list_index}") - math(EXPR combo_has_bit "${c} & ${bit}") - list(GET list_in ${list_index} list_elem) - if (combo_has_bit) - if (current_combo) - set(current_combo "${current_combo}:${list_elem}") - else () - set(current_combo ${list_elem}) - endif () - endif () - endforeach () - list(APPEND LIST_OUT ${current_combo}) - endforeach () - list(APPEND LIST_OUT " ") # Empty set is a valic combination, but CMake isn't appending the empty string for some reason, use a space - set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) -endfunction () - -# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in -# @param sources_in the source files to build from -# @param defines_in the preprocessor definitions that will be combined to create the object files -# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects -function(GenerateObjects sources_in defines_in all_defines_in) - AllCombinations("${defines_in}") - set(define_combos ${LIST_OUT}) - foreach (source_file ${sources_in}) - foreach (def_combo ${define_combos}) - - # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with - string(REPLACE ":" ";" def_combo ${def_combo}) - - # build a unique variable name for 
this obj file by picking two letters from the defines (can't use one in this case) - set(obj_name "") - foreach (combo_elem ${def_combo}) - string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) - set(obj_name "${obj_name}${letter}") - endforeach () - - # parse file name - string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) - string(TOUPPER ${source_name} source_name) - - # prepend the uppercased file name to the obj name - set(obj_name "${source_name}_${obj_name}_OBJS") - - # now add the object and set the defines - add_library(${obj_name} OBJECT ${source_file}) - set(cur_defines ${def_combo}) - if ("${cur_defines}" STREQUAL " ") - set(cur_defines ${all_defines_in}) - else () - list(APPEND cur_defines ${all_defines_in}) - endif () - if (cur_defines AND NOT "${cur_defines}" STREQUAL " ") # using space as the empty set - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${cur_defines}") - endif () - endforeach () - endforeach () -endfunction () - GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE") GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN;DOUBLE") GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE") From 3e8ea7a351fa3903dc64340a3ed4597829182ce0 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 Jan 2015 14:06:14 -0600 Subject: [PATCH 023/257] Added COMMONOBJS to driver/others CMakeLists.txt. 
--- driver/others/CMakeLists.txt | 74 +++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 2685d79c8..57b551a75 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -1,2 +1,72 @@ - -# NYI +include_directories(${CMAKE_SOURCE_DIR}) + +if (${CORE} STREQUAL "PPC440") + set(MEMORY memory_qalloc.c) +else () + set(MEMORY memory.c) +endif () + +add_library(COMMONOBJS OBJECT + ${MEMORY} + xerbla.c + abs.c # TODO: this is split into c_abs (DOUBLE unset) and z_abs (DOUBLE set) in the Makefile + openblas_set_num_threads.c + openblas_get_config.c + openblas_get_parallel.c + openblas_error_handle.c +) + +#ifdef SMP +#COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +#ifndef NO_AFFINITY +#COMMONOBJS += init.$(SUFFIX) +#endif +#endif +# +#ifeq ($(DYNAMIC_ARCH), 1) +#COMMONOBJS += dynamic.$(SUFFIX) +#else +#COMMONOBJS += parameter.$(SUFFIX) +#endif +# +#ifdef EXPRECISION +#COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) +#endif +# +#ifdef QUAD_PRECISION +#COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) +#endif +# +#ifdef USE_CUDA +#COMMONOBJS += cuda_init.$(SUFFIX) +#endif +# +#ifdef FUNCTION_PROFILE +#COMMONOBJS += profile.$(SUFFIX) +#endif +# +#LIBOTHERS = libothers.$(LIBSUFFIX) +# +#ifeq ($(USE_OPENMP), 1) +#BLAS_SERVER = blas_server_omp.c +#else +#ifeq ($(OSNAME), WINNT) +#BLAS_SERVER = blas_server_win32.c +#endif +#ifeq ($(OSNAME), CYGWIN_NT) +#BLAS_SERVER = blas_server_win32.c +#endif +#ifeq ($(OSNAME), Interix) +#BLAS_SERVER = blas_server_win32.c +#endif +#endif +# +#ifndef BLAS_SERVER +#BLAS_SERVER = blas_server.c +#endif +# +#ifeq ($(DYNAMIC_ARCH), 1) +#HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +#else +#HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) +#endif From 5057a4b4dfc29f40bff0b025ff02f301ffaabdcd Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 30 
Jan 2015 15:21:21 -0600 Subject: [PATCH 024/257] Added openblas add_library call that uses DBLAS_OBJS ojbects. --- CMakeLists.txt | 8 ++++++++ cmake/utils.cmake | 3 +++ driver/level2/CMakeLists.txt | 7 ++----- driver/level3/CMakeLists.txt | 9 +++++++++ driver/others/CMakeLists.txt | 2 +- interface/CMakeLists.txt | 5 ++++- 6 files changed, 27 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be52d9713..246ad3097 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,10 +53,18 @@ if (${NO_STATIC} AND ${NO_SHARED}) message(FATAL_ERROR "Neither static nor shared are enabled.") endif () +set(DBLAS_OBJS "") foreach (BLAS_DIR ${BLASDIRS}) add_subdirectory(${BLAS_DIR}) endforeach () +# get obj vars into format that add_library likes: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) +set(TARGET_OBJS "") +foreach (DBLAS_OBJ ${DBLAS_OBJS}) + list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:${DBLAS_OBJ}>") +endforeach () +add_library(openblas ${TARGET_OBJS}) + #Save the config files for installation # @cp Makefile.conf Makefile.conf_last # @cp config.h config_last.h diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a95695553..944e24cc4 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -50,6 +50,7 @@ endfunction () function(GenerateObjects sources_in defines_in all_defines_in) AllCombinations("${defines_in}") set(define_combos ${LIST_OUT}) + set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) foreach (def_combo ${define_combos}) @@ -81,7 +82,9 @@ function(GenerateObjects sources_in defines_in all_defines_in) if (cur_defines AND NOT "${cur_defines}" STREQUAL " ") # using space as the empty set set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${cur_defines}") endif () + list(APPEND OBJ_LIST_OUT ${obj_name}) endforeach () endforeach () + set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) endfunction () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index ff6faab90..1fbf7c729 100644 --- 
a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -79,9 +79,6 @@ set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS "UNIT;TRANS") # ) #endif () -set(DBLAS_TARGETS DBLAS_NONE DBLAS_T DBLAS_L DBLAS_U DBLAS_TU) - -foreach (DBLAS_TARGET ${DBLAS_TARGETS}) - set_target_properties(${DBLAS_TARGET} PROPERTIES COMPILE_DEFINITIONS DOUBLE) -endforeach () +list(APPEND DBLAS_OBJS "DBLAS_NONE;DBLAS_T;DBLAS_L;DBLAS_U;DBLAS_TU") +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 2b5c18007..c6f008baa 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -27,12 +27,18 @@ set(GEMM_DEFINES NN NT TN TT) foreach (GEMM_DEFINE ${GEMM_DEFINES}) add_library(GEMM_${GEMM_DEFINE}_OBJS OBJECT gemm.c) set_target_properties(GEMM_${GEMM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${GEMM_DEFINE}) + list(APPEND DBLAS_OBJS GEMM_${GEMM_DEFINE}_OBJS) endforeach () + GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN;DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (SMP) # @@ -59,3 +65,6 @@ GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE") # dgemm_thread_tn.c dgemm_thread_tt.c #endif # + +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS + diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 57b551a75..10af485e9 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -6,7 +6,7 @@ else () set(MEMORY memory.c) endif () -add_library(COMMONOBJS OBJECT +add_library(COMMON_OBJS OBJECT ${MEMORY} xerbla.c abs.c 
# TODO: this is split into c_abs (DOUBLE unset) and z_abs (DOUBLE set) in the Makefile diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c38a73f84..c8ea1cad6 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -41,5 +41,8 @@ add_library(DBLAS3OBJS OBJECT # trmm is trsm with a compiler flag set add_library(TRMM_OBJ OBJECT trsm.c) -set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS TRMM) +set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") + +list(APPEND DBLAS_OBJS "DBLAS1OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;DBLAS2OBJS;DBLAS3OBJS;TRMM_OBJ") +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 9e154aba58cb7efa00af1bfd4331ba22c02f9ce6 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 2 Feb 2015 12:31:15 -0600 Subject: [PATCH 025/257] Added LAPACK object files to interface CMakeLists. --- interface/CMakeLists.txt | 98 ++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c8ea1cad6..79b3b3c09 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,48 +1,60 @@ include_directories(${CMAKE_SOURCE_DIR}) -# TODO: Need to generate object files for S, D, C, Q and X - start with D for now. -# The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). - -add_library(DBLAS1OBJS OBJECT - axpy.c swap.c - copy.c scal.c - dot.c - asum.c nrm2.c - max.c # amax/min/amin compiled later from same source - rot.c rotg.c rotm.c rotmg.c - axpby.c -) - -# N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. 
-add_library(AMAX_OBJ OBJECT max.c) -set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS") -add_library(AMIN_OBJ OBJECT max.c) -set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS;USE_MIN") -add_library(MIN_OBJ OBJECT max.c) -set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") - -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f -add_library(DBLAS2OBJS OBJECT - gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c - tbsv.c tbmv.c - tpsv.c tpmv.c -) - -add_library(DBLAS3OBJS OBJECT - gemm.c symm.c - trsm.c syrk.c syr2k.c - omatcopy.c imatcopy.c -) - -# trmm is trsm with a compiler flag set -add_library(TRMM_OBJ OBJECT trsm.c) -set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") - -list(APPEND DBLAS_OBJS "DBLAS1OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;DBLAS2OBJS;DBLAS3OBJS;TRMM_OBJ") +if (NOT DEFINED NO_CBLAS) + # TODO: Need to generate object files for S, D, C, Q and X - start with D for now. + # The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). + + add_library(DBLAS1OBJS OBJECT + axpy.c swap.c + copy.c scal.c + dot.c + asum.c nrm2.c + max.c # amax/min/amin compiled later from same source + rot.c rotg.c rotm.c rotmg.c + axpby.c + ) + + # N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. 
+ add_library(AMAX_OBJ OBJECT max.c) + set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS") + add_library(AMIN_OBJ OBJECT max.c) + set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS;USE_MIN") + add_library(MIN_OBJ OBJECT max.c) + set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") + + # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f + add_library(DBLAS2OBJS OBJECT + gemv.c ger.c + trsv.c trmv.c symv.c + syr.c syr2.c gbmv.c + sbmv.c spmv.c + spr.c spr2.c + tbsv.c tbmv.c + tpsv.c tpmv.c + ) + + add_library(DBLAS3OBJS OBJECT + gemm.c symm.c + trsm.c syrk.c syr2k.c + omatcopy.c imatcopy.c + ) + + # trmm is trsm with a compiler flag set + add_library(TRMM_OBJ OBJECT trsm.c) + set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") + + list(APPEND DBLAS_OBJS "DBLAS1OBJS;DBLAS2OBJS;DBLAS3OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;TRMM_OBJ") +endif () + +if (NOT DEFINED NO_LAPACK) + add_library(DLAPACK_OBJS OBJECT + lapack/getrf.c lapack/getrs.c lapack/potrf.c lapack/getf2.c + lapack/potf2.c lapack/laswp.c lapack/gesv.c lapack/lauu2.c + lapack/lauum.c lapack/trti2.c lapack/trtri.c + ) + list(APPEND DBLAS_OBJS "DLAPACK_OBJS") +endif () + set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From d11bde60d0ec4f2d597cd5493aad4f8e44b5ecff Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 2 Feb 2015 15:00:44 -0600 Subject: [PATCH 026/257] DOUBLE define for DBLAS objects is now set in main CMakeLists.txt. Since the objects are the same, could generate SINGLE/COMPLEX/etc here without having to rewrite all the object enumeration code again. 
--- CMakeLists.txt | 5 ++++- driver/level3/CMakeLists.txt | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 246ad3097..bf1563d0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench) # all :: libs netlib tests shared -# libs: +# libs : if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") endif () @@ -61,10 +61,13 @@ endforeach () # get obj vars into format that add_library likes: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (DBLAS_OBJ ${DBLAS_OBJS}) + get_target_property(PREV_DEFS ${DBLAS_OBJ} COMPILE_DEFINITIONS) + set_target_properties(${DBLAS_OBJ} PROPERTIES COMPILE_DEFINITIONS "${PREV_DEFS};DOUBLE") list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:${DBLAS_OBJ}>") endforeach () add_library(openblas ${TARGET_OBJS}) +# TODO: Why is the config saved here? Is this necessary with CMake? 
#Save the config files for installation # @cp Makefile.conf Makefile.conf_last # @cp config.h config_last.h diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index c6f008baa..57865d18b 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -31,13 +31,13 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endforeach () -GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE") +GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN;DOUBLE") +GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE") +GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE") +GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (SMP) From 7194424fef52a7f93d2fd0ae5e5de8488749e7e6 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 2 Feb 2015 15:21:29 -0600 Subject: [PATCH 027/257] Added missing common objects to the library. 
--- driver/others/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 10af485e9..a28cf1e79 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -44,7 +44,9 @@ add_library(COMMON_OBJS OBJECT #ifdef FUNCTION_PROFILE #COMMONOBJS += profile.$(SUFFIX) #endif -# + +list(APPEND DBLAS_OBJS "COMMON_OBJS") + #LIBOTHERS = libothers.$(LIBSUFFIX) # #ifeq ($(USE_OPENMP), 1) @@ -70,3 +72,6 @@ add_library(COMMON_OBJS OBJECT #else #HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) #endif + +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS + From 20e593a44ae02882119991f9b3e7fd493f8ab6e1 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 2 Feb 2015 16:25:30 -0600 Subject: [PATCH 028/257] Added cblas_ objects to interface CMakeLists. Naming isn't right, though, not seeing cblas_xxxx exports in the resulting library. --- interface/CMakeLists.txt | 83 +++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 79b3b3c09..6ef498cb5 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,19 +1,34 @@ include_directories(${CMAKE_SOURCE_DIR}) -if (NOT DEFINED NO_CBLAS) - # TODO: Need to generate object files for S, D, C, Q and X - start with D for now. - # The sources are the same, but there are additional preprocessor definitions depending on the precision (see Makefile.tail). 
- - add_library(DBLAS1OBJS OBJECT - axpy.c swap.c - copy.c scal.c - dot.c - asum.c nrm2.c - max.c # amax/min/amin compiled later from same source - rot.c rotg.c rotm.c rotmg.c - axpby.c - ) +set(BLAS1_SOURCES + axpy.c swap.c + copy.c scal.c + dot.c + asum.c nrm2.c + rot.c rotg.c rotm.c rotmg.c + axpby.c +) + +# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +set(BLAS2_SOURCES + gemv.c ger.c + trsv.c trmv.c symv.c + syr.c syr2.c gbmv.c + sbmv.c spmv.c + spr.c spr2.c + tbsv.c tbmv.c + tpsv.c tpmv.c +) + +set(BLAS3_SOURCES + gemm.c symm.c + trsm.c syrk.c syr2k.c + omatcopy.c imatcopy.c +) + + +if (NOT DEFINED NO_FBLAS) # N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. add_library(AMAX_OBJ OBJECT max.c) @@ -22,29 +37,37 @@ if (NOT DEFINED NO_CBLAS) set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS;USE_MIN") add_library(MIN_OBJ OBJECT max.c) set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") + add_library(MAX_OBJ OBJECT max.c) - # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f - add_library(DBLAS2OBJS OBJECT - gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c - tbsv.c tbmv.c - tpsv.c tpmv.c - ) - - add_library(DBLAS3OBJS OBJECT - gemm.c symm.c - trsm.c syrk.c syr2k.c - omatcopy.c imatcopy.c - ) + add_library(DBLAS1OBJS OBJECT ${BLAS1_SOURCES}) + add_library(DBLAS2OBJS OBJECT ${BLAS2_SOURCES}) + add_library(DBLAS3OBJS OBJECT ${BLAS3_SOURCES}) # trmm is trsm with a compiler flag set add_library(TRMM_OBJ OBJECT trsm.c) set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") - list(APPEND DBLAS_OBJS "DBLAS1OBJS;DBLAS2OBJS;DBLAS3OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;TRMM_OBJ") + list(APPEND DBLAS_OBJS "DBLAS1OBJS;DBLAS2OBJS;DBLAS3OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;MAX_OBJ;TRMM_OBJ") +endif () + +if (NOT DEFINED 
NO_CBLAS) + + add_library(ISAMAX_OBJ OBJECT imax.c) + set_target_properties(ISAMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "CBLAS;USE_ABS") + + add_library(CDBLAS1_OBJS OBJECT ${BLAS1_SOURCES}) + add_library(CDBLAS2_OBJS OBJECT ${BLAS2_SOURCES}) + add_library(CDBLAS3_OBJS OBJECT ${BLAS3_SOURCES}) + + # trmm is trsm with a compiler flag set + add_library(CTRMM_OBJ OBJECT trsm.c) + set_target_properties(CTRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "CBLAS;TRMM") + + set_target_properties(CDBLAS1_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") + set_target_properties(CDBLAS2_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") + set_target_properties(CDBLAS3_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") + + list(APPEND DBLAS_OBJS "CDBLAS1_OBJS;CDBLAS2_OBJS;CDBLAS3_OBJS;ISAMAX_OBJ;CTRMM_OBJ") endif () if (NOT DEFINED NO_LAPACK) From 31cf22cb4b8b7bbff1d69e1f4d2928002a2dc727 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 11:07:58 -0600 Subject: [PATCH 029/257] Ported OS settings from Makefile.system into new cmake file. --- cmake/os.cmake | 104 +++++++++++++++++++++++++++++++++++++++++++++ cmake/system.cmake | 6 ++- 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 cmake/os.cmake diff --git a/cmake/os.cmake b/cmake/os.cmake new file mode 100644 index 000000000..d897a2506 --- /dev/null +++ b/cmake/os.cmake @@ -0,0 +1,104 @@ +## +## Author: Hank Anderson +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from portion of OpenBLAS/Makefile.system +## Detects the OS and sets appropriate variables. 
+ +if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD") + set(MD5SUM "md5 -n") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") + set(EXTRALIB "${EXTRALIB} -lm") +endif () + +# TODO: this is probably meant for mingw, not other windows compilers +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32") + + # probably not going to use these + set(SUFFIX "obj") + set(PSUFFIX "pobj") + set(LIBSUFFIX "a") + + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + + # Test for supporting MS_ABI + # removed string parsing in favor of CMake's version comparison -hpa + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + # GCC Version >=4.7 + # It is compatible with MSVC ABI. 
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + endif () + + # Ensure the correct stack alignment on Win32 + # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 + if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") + set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") + endif () + +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(INTERIX_TOOL_DIR STREQUAL "/opt/gcc.3.3/i586-pc-interix3/bin") +endif () + +if (CYGWIN) + set(NEED_PIC 0) + set(NO_EXPRECISION 1) +endif + +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + if (SMP) + set(EXTRALIB "${EXTRALIB} -lpthread") + endif () +endif () + +if (QUAD_PRECISION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION") + set(NO_EXPRECISION 1) +endif () + +if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") + set(NO_EXPRECISION 1) +endif () + +if (UTEST_CHECK) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") + set(SANITY_CHECK 1) +endif () + +if (SANITY_CHECK) + # TODO: need some way to get $(*F) (target filename) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") +endif + diff --git a/cmake/system.cmake b/cmake/system.cmake index 0753ed028..1d9c4612d 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -128,5 +128,9 @@ set(DLLWRAP "$(CROSS_SUFFIX)dllwrap") set(OBJCOPY "$(CROSS_SUFFIX)objcopy") set(OBJCONV "$(CROSS_SUFFIX)objconv") -# TODO: convert rest of Makefile.system, left off at "OS dependent settings" + +# +# OS dependent settings +# +include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") From e66aa5f3b7bb58fe7e3b94461978d317deba3e39 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 11:32:20 -0600 Subject: [PATCH 030/257] Ported arch dependent settings from Makefile.system to new cmake file. 
--- CMakeLists.txt | 3 -- cmake/arch.cmake | 115 +++++++++++++++++++++++++++++++++++++++++++ cmake/c_check.cmake | 3 -- cmake/os.cmake | 6 +-- cmake/prebuild.cmake | 3 -- cmake/system.cmake | 8 +-- 6 files changed, 122 insertions(+), 16 deletions(-) create mode 100644 cmake/arch.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index bf1563d0b..66292940d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,5 @@ ## ## Author: Hank Anderson -## Copyright: (c) Stat-Ease, Inc. -## Created: 12/23/14 -## Last Modified: 12/23/14 ## cmake_minimum_required(VERSION 2.8.4) diff --git a/cmake/arch.cmake b/cmake/arch.cmake new file mode 100644 index 000000000..9b459ae90 --- /dev/null +++ b/cmake/arch.cmake @@ -0,0 +1,115 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets various variables based on architecture. + +if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") + + if (${ARCH} STREQUAL "x86") + if (NOT BINARY) + set(NO_BINARY_MODE 1) + endif () + endif () + + if (NOT NO_EXPRECISION) + if (${Fortran_COMPILER_NAME} MATCHES "gfortran.*") + # N.B. 
I'm not sure if CMake differentiates between GCC and LSB -hpa + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") +endif () + +if (USE_OPENMP) + + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + message(WARNING "Clang doesn't support OpenMP yet.") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + set(CEXTRALIB "${CEXTRALIB} -lstdc++") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () +endif () + + +if (DYNAMIC_ARCH) + if (${ARCH} STREQUAL "x86") + set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + endif () + + if (${ARCH} STREQUAL "x86_64") + set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + if (NOT NO_AVX) + set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") + endif () + if (NOT NO_AVX2) + set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL") + endif () + endif () + + if (NOT DYNAMIC_CORE) + unset(DYNAMIC_ARCH) + endif () 
+endif () + +if (${ARCH} STREQUAL "ia64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) + + if (${Fortran_COMPILER_NAME} MATCHES "gfortran.*") + if (${CMAKE_C_COMPILER} STREQUAL "GNU") + # EXPRECISION = 1 + # CCOMMON_OPT += -DEXPRECISION + endif + endif +endif + +if (${ARCH} STREQUAL "mips64") + set(NO_BINARY_MODE 1) +endif + +if (${ARCH} STREQUAL "alpha") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index d8facfedc..2fbfd5745 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -1,8 +1,5 @@ ## ## Author: Hank Anderson -## Copyright: (c) Stat-Ease, Inc. -## Created: 12/29/14 -## Last Modified: 12/29/14 ## Description: Ported from the OpenBLAS/c_check perl script. ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf. diff --git a/cmake/os.cmake b/cmake/os.cmake index d897a2506..fc2c40268 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -1,7 +1,5 @@ ## ## Author: Hank Anderson -## Created: 12/29/14 -## Last Modified: 12/29/14 ## Description: Ported from portion of OpenBLAS/Makefile.system ## Detects the OS and sets appropriate variables. 
@@ -58,7 +56,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 - if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") + if (${ARCH} STREQUAL "x86") set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") endif () @@ -88,7 +86,7 @@ if (QUAD_PRECISION) set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") +if (${ARCH} STREQUAL "x86") set(NO_EXPRECISION 1) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 9595dab0d..8e05647a3 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1,8 +1,5 @@ ## ## Author: Hank Anderson -## Copyright: (c) Stat-Ease, Inc. -## Created: 12/29/14 -## Last Modified: 12/29/14 ## Description: Ported from OpenBLAS/Makefile.prebuild ## This is triggered by system.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). diff --git a/cmake/system.cmake b/cmake/system.cmake index 1d9c4612d..3fa75d65d 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -1,8 +1,5 @@ ## ## Author: Hank Anderson -## Copyright: (c) Stat-Ease, Inc. -## Created: 12/29/14 -## Last Modified: 12/29/14 ## Description: Ported from OpenBLAS/Makefile.system ## @@ -134,3 +131,8 @@ set(OBJCONV "$(CROSS_SUFFIX)objconv") # include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") +# +# Architecture dependent settings +# +include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") + From af11aff3093807bc1c0e5311cc525873e829a2bf Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 12:00:49 -0600 Subject: [PATCH 031/257] Ported C compiler settings from Makefile.system into new cmake file. 
--- cmake/cc.cmake | 66 ++++++++++++++++++++++++++++++++++++++++++++++ cmake/system.cmake | 12 ++++----- 2 files changed, 71 insertions(+), 7 deletions(-) create mode 100644 cmake/cc.cmake diff --git a/cmake/cc.cmake b/cmake/cc.cmake new file mode 100644 index 000000000..b6ce0e281 --- /dev/null +++ b/cmake/cc.cmake @@ -0,0 +1,66 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets C related variables. + +if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") + + set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") + COMMON_PROF += -fno-inline + NO_UNINITIALIZED_WARN = -Wno-uninitialized + + if (QUIET_MAKE) + set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") + endif () + + if (NO_BINARY_MODE) + + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32") + endif () + set(BINARY_DEFINED 1) + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${OSNAME} STREQUAL "AIX") + set(BINARY_DEFINED 1) + endif () + endif () + + if (NOT BINARY_DEFINED) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PGI") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7") + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () +endif () + diff --git a/cmake/system.cmake b/cmake/system.cmake index 
3fa75d65d..d46538df4 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -125,14 +125,12 @@ set(DLLWRAP "$(CROSS_SUFFIX)dllwrap") set(OBJCOPY "$(CROSS_SUFFIX)objcopy") set(OBJCONV "$(CROSS_SUFFIX)objconv") - -# -# OS dependent settings -# +# OS dependent settings include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") -# -# Architecture dependent settings -# +# Architecture dependent settings include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") +# C Compiler dependent settings +include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") + From 2d5b442f5bdb4a2cacededabe5142b8ff1a2cadd Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 12:32:23 -0600 Subject: [PATCH 032/257] Ported Fortran configuration code from Makefile.system to fc.cmake. --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d46538df4..c81afb9a4 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -134,3 +134,6 @@ include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") +# Fortran Compiler dependent settings +include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") + From e4bfbd8258948507512cda5f4181490641570da8 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 13:08:59 -0600 Subject: [PATCH 033/257] Added fc.cmake (forgot it in last commit). Moved a couple C compiler ifs from Makefile.system into cc.cmake. 
--- cmake/cc.cmake | 37 ++++++++++ cmake/fc.cmake | 193 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+) create mode 100644 cmake/fc.cmake diff --git a/cmake/cc.cmake b/cmake/cc.cmake index b6ce0e281..0cae8f9cf 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -64,3 +64,40 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") endif () endif () +if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + + if (${ARCH} STREQUAL "mips64") + + if (NOT BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -n32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -n64") + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + else () + + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + endif () + endif +endif + +if (${CMAKE_C_COMPILER} STREQUAL "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -w") + if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () +endif () + diff --git a/cmake/fc.cmake b/cmake/fc.cmake new file mode 100644 index 000000000..727098d34 --- /dev/null +++ b/cmake/fc.cmake @@ -0,0 +1,193 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets Fortran related variables. 
+ +if (${Fortran_COMPILER_NAME} STREQUAL "G77") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "G95") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "GFORTRAN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + if (NOT NO_LAPACK) + set(EXTRALIB "{EXTRALIB} -lgfortran") + endif () + if (NO_BINARY_MODE) + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "INTEL") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "FUJITSU") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "IBM") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") + # FCOMMON_OPT += -qarch=440 + if 
(BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -q64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -q32") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") + set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (NOT ${ARCH} STREQUAL "mips64") + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "OPEN64") + + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (${ARCH} STREQUAL "mips64") + + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -n32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -n64") + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + else () + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} 
-m64") + endif () + endif () + + if (USE_OPENMP) + set(FEXTRALIB "${FEXTRALIB} -lstdc++") + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${Fortran_COMPILER_NAME} "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") + if (${ARCH} STREQUAL "x86") + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") + endif () +endif () + +if (${Fortran_COMPILER_NAME} STREQUAL "COMPAQ") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + From e818ace11af47ddacc5d5b9856b4d9db54ca9f98 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 13:34:41 -0600 Subject: [PATCH 034/257] Ported more of Makefile.system to CMake. --- cmake/system.cmake | 204 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 197 insertions(+), 7 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index c81afb9a4..ad4a6f3be 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -117,13 +117,13 @@ endif () unset(ARFLAGS) set(CPP "${COMPILER} -E") set(AR "${CROSS_SUFFIX}ar") -set(AS "$(CROSS_SUFFIX)as") -set(LD "$(CROSS_SUFFIX)ld") -set(RANLIB "$(CROSS_SUFFIX)ranlib") -set(NM "$(CROSS_SUFFIX)nm") -set(DLLWRAP "$(CROSS_SUFFIX)dllwrap") -set(OBJCOPY "$(CROSS_SUFFIX)objcopy") -set(OBJCONV "$(CROSS_SUFFIX)objconv") +set(AS "${CROSS_SUFFIX}as") +set(LD "${CROSS_SUFFIX}ld") +set(RANLIB "${CROSS_SUFFIX}ranlib") +set(NM "${CROSS_SUFFIX}nm") +set(DLLWRAP "${CROSS_SUFFIX}dllwrap") +set(OBJCOPY "${CROSS_SUFFIX}objcopy") +set(OBJCONV "${CROSS_SUFFIX}objconv") # OS dependent settings include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") @@ -137,3 +137,193 @@ include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") # Fortran Compiler dependent settings include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") +if (BINARY64) + if (INTERFACE64) + # CCOMMON_OPT += -DUSE64BITINT + endif 
() +endif () + +if (NEED_PIC) + if (${CMAKE_C_COMPILER} STREQUAL "IBM") + set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC") + endif () + + if (${Fortran_COMPILER_NAME} STREQUAL "SUN") + set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") + endif () +endif () + +if (DYNAMIC_ARCH) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") +endif () + +if (NO_LAPACK) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") + #Disable LAPACK C interface + set(NO_LAPACKE 1) +endif () + +if (NO_LAPACKE) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACKE") +endif () + +if (NO_AVX) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") +endif () + +if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") +endif () + +if (NO_AVX2) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") +endif () + +if (SMP) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMP_SERVER") + + if (${ARCH} STERQUAL "mips64") + if (NOT ${CORE} STREQUAL "LOONGSON3B") + set(USE_SIMPLE_THREADED_LEVEL3 1) + endif () + endif () + + if (USE_OPENMP) + # USE_SIMPLE_THREADED_LEVEL3 = 1 + # NO_AFFINITY = 1 + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") + endif () + + if (BIGNUMA) + set(CCOMMON_OPT "${CCOMMON_OPT} -DBIGNUMA") + endif () + +endif () + +if (NO_WARMUP) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_WARMUP") +endif () + +if (CONSISTENT_FPCSR) + set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") +endif () + +# Only for development +# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") +# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") +# set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_SWITCHING") +# set(USE_PAPI 1) + +if (USE_PAPI) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_PAPI") + set(EXTRALIB "${EXTRALIB} -lpapi -lperfctr") +endif () + +if (DYNAMIC_THREADS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_THREADS") +endif () + +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") + +if (USE_SIMPLE_THREADED_LEVEL3) + set(CCOMMON_OPT "${CCOMMON_OPT} 
-DUSE_SIMPLE_THREADED_LEVEL3") +endif () + +if (DEFINED LIBNAMESUFFIX) + set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") +else () + set(LIBPREFIX "libopenblas") +endif () + +if (NOT DEFINED SYMBOLPREFIX) + set(SYMBOLPREFIX "") +endif () + +if (NOT DEFINED SYMBOLSUFFIX) + set(SYMBOLSUFFIX "") +endif () + +set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") + +# TODO: nead to convert these Makefiles +# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake + +# TODO: Need to figure out how to get $(*F) in cmake +set(CCOMMON_OPT "${CCOMMON_OPT} -DASMNAME=${FU}$(*F) -DASMFNAME=${FU}$(*F)${BU} -DNAME=$(*F)${BU} -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)${BU}\" -DCHAR_CNAME=\"$(*F)\"") + +if (${CORE} STREQUAL "PPC440") + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") +endif () + +if (${CORE} STREQUAL "PPC440FP2") + set(STATIC_ALLOCATION 1) +endif () + +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(NO_AFFINITY 1) +endif () + +if (NOT ${ARCH} STREQUAL "x86_64" AND NOT ${ARCH} STREQUAL "x86" AND NOT ${CORE} STREQUAL "LOONGSON3B") + set(NO_AFFINITY 1) +endif () + +if (NO_AFFINITY) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AFFINITY") +endif () + +if (FUNCTION_PROFILE) + set(CCOMMON_OPT "${CCOMMON_OPT} -DFUNCTION_PROFILE") +endif () + +if (HUGETLB_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLB") +endif () + +if (DEFINED HUGETLBFILE_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=${HUGETLBFILE_ALLOCATION})") +endif () + +if (STATIC_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_STATIC") +endif () + +if (DEVICEDRIVER_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"") +endif () + +if (MIXED_MEMORY_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") +endif () + +if (${OSNAME} STREQUAL "SunOS") + set(TAR gtar) + set(PATCH gpatch) + set(GREP ggrep) +else () + set(TAR tar) + set(PATCH patch) + set(GREP grep) +endif () + +if (NOT DEFINED MD5SUM) + set(MD5SUM 
md5sum) +endif () + +set(AWK awk) + +set(REVISION "-r${VERSION}") +string(REGEX MATCH "[0-9]+\\." MAJOR_VERSION "${VERSION}") + +if (DEBUG) + set(COMMON_OPT "${COMMON_OPT} -g") +endif () + +if (NOT DEFINED COMMON_OPT) + set(COMMON_OPT "-O2") +endif () + + From be1ce38f24f7c392589a93a053980719ad5be58f Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 14:00:29 -0600 Subject: [PATCH 035/257] Fixed some missing parentheses. --- cmake/arch.cmake | 8 ++++---- cmake/os.cmake | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9b459ae90..b32c8b654 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -90,13 +90,13 @@ if (${ARCH} STREQUAL "ia64") if (${CMAKE_C_COMPILER} STREQUAL "GNU") # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION - endif - endif -endif + endif () + endif () +endif () if (${ARCH} STREQUAL "mips64") set(NO_BINARY_MODE 1) -endif +endif () if (${ARCH} STREQUAL "alpha") set(NO_BINARY_MODE 1) diff --git a/cmake/os.cmake b/cmake/os.cmake index fc2c40268..c333cdbe0 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -73,7 +73,7 @@ endif () if (CYGWIN) set(NEED_PIC 0) set(NO_EXPRECISION 1) -endif +endif () if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") if (SMP) @@ -98,5 +98,5 @@ endif () if (SANITY_CHECK) # TODO: need some way to get $(*F) (target filename) set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") -endif +endif () From 30be55150258a38e0ca0db133d1d4754e58f404e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 14:21:22 -0600 Subject: [PATCH 036/257] Corrected fortran compiler name variables. Fixed some typos. Updated c_check to set ARCH and BINARY64/32. Added version variables. 
--- CMakeLists.txt | 4 ++++ cmake/arch.cmake | 4 ++-- cmake/c_check.cmake | 40 ++++++++++++++++++++++++++++++++++------ cmake/cc.cmake | 8 ++++---- cmake/f_check.cmake | 15 ++++++++++++++- cmake/fc.cmake | 22 +++++++++++----------- cmake/prebuild.cmake | 15 +++------------ cmake/system.cmake | 10 +++++----- 8 files changed, 77 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66292940d..0869e6fad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,10 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) +set(OpenBLAS_MAJOR_VERSION 0) +set(OpenBLAS_MINOR_VERSION 2) +set(OpenBLAS_PATCH_VERSION 13) +set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # is this necessary? lapack-netlib has its own fortran checks in its CMakeLists.txt #enable_language(Fortran) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index b32c8b654..5848c2409 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -12,7 +12,7 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") endif () if (NOT NO_EXPRECISION) - if (${Fortran_COMPILER_NAME} MATCHES "gfortran.*") + if (${CMAKE_Fortran_COMPILER} MATCHES "gfortran.*") # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") set(EXPRECISION 1) @@ -86,7 +86,7 @@ if (${ARCH} STREQUAL "ia64") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) - if (${Fortran_COMPILER_NAME} MATCHES "gfortran.*") + if (${CMAKE_Fortran_COMPILER} MATCHES "gfortran.*") if (${CMAKE_C_COMPILER} STREQUAL "GNU") # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 2fbfd5745..961bb00c4 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -4,6 +4,26 @@ ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf. 
+# CMake vars set by this file: +# OSNAME (use CMAKE_SYSTEM_NAME) +# ARCH +# C_COMPILER (use CMAKE_C_COMPILER) +# BINARY32 +# BINARY64 +# FU +# CROSS_SUFFIX +# CROSS +# CEXTRALIB + +# Defines set by this file: +# OS_ +# ARCH_ +# C_ +# __32BIT__ +# __64BIT__ +# FUNDERSCORE +# PTHREAD_CREATE_FUNC + # N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. # TODO: detect NEED_FU @@ -23,17 +43,25 @@ if (NOT DEFINED BINARY) endif () endif () +if (BINARY EQUAL 64) + set(BINARY64 1) +else () + set(BINARY32 1) +endif () + # CMake docs define these: # CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for. # CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. -set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -if (${HOST_ARCH} STREQUAL "AMD64") - set(HOST_ARCH "X86_64") +# +# TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check +set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) +if (${ARCH} STREQUAL "AMD64") + set(ARCH "X86_64") endif () # If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong -if (${HOST_ARCH} STREQUAL "X86_64" AND BINARY EQUAL 32) - set(HOST_ARCH X86) +if (${ARCH} STREQUAL "X86_64" AND BINARY EQUAL 32) + set(ARCH X86) endif () set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) @@ -43,7 +71,7 @@ endif () file(WRITE ${TARGET_CONF} "#define OS_${HOST_OS}\t1\n" - "#define ARCH_${HOST_ARCH}\t1\n" + "#define ARCH_${ARCH}\t1\n" "#define C_${COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${NEED_FU}\n") diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 0cae8f9cf..28daa72dc 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -6,8 +6,8 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") - COMMON_PROF += -fno-inline - NO_UNINITIALIZED_WARN = -Wno-uninitialized + set(COMMON_PROF "${COMMON_PROF} -fno-inline") + 
set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") if (QUIET_MAKE) set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") @@ -89,8 +89,8 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") else () set(CCOMMON_OPT "${CCOMMON_OPT} -m64") endif () - endif -endif + endif () +endif () if (${CMAKE_C_COMPILER} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -w") diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index a291430aa..6c1dbdf18 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -7,8 +7,20 @@ ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Appends Fortran information to config.h and Makefile.conf. +# CMake vars set by this file: +# F_COMPILER +# FC +# BU +# NOFORTRAN +# NEED2UNDERSCORES +# FEXTRALIB -if (NOT ${ONLY_CBLAS}) +# Defines set by this file: +# BUNDERSCORE +# NEEDBUNDERSCORE +# NEED2UNDERSCORES + +if (NOT ONLY_CBLAS) # N.B. f_check is not cross-platform, so instead try to use CMake variables # run f_check (appends to TARGET files) # message(STATUS "Running f_check...") @@ -30,6 +42,7 @@ else () #When we only build CBLAS, we set NOFORTRAN=2 set(NOFORTRAN 2) set(NO_FBLAS 1) + #set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler set(BU "_") file(APPEND ${TARGET_CONF} "#define BUNDERSCORE _\n" diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 727098d34..61f0e0187 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,7 +3,7 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. 
-if (${Fortran_COMPILER_NAME} STREQUAL "G77") +if (${CMAKE_Fortran_COMPILER} STREQUAL "G77") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) @@ -15,7 +15,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "G77") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "G95") +if (${CMAKE_Fortran_COMPILER} STREQUAL "G95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) @@ -27,7 +27,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "G95") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "GFORTRAN") +if (${CMAKE_Fortran_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc @@ -58,7 +58,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "GFORTRAN") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "INTEL") +if (${CMAKE_Fortran_COMPILER} STREQUAL "INTEL") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") @@ -68,14 +68,14 @@ if (${Fortran_COMPILER_NAME} STREQUAL "INTEL") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "FUJITSU") +if (${CMAKE_Fortran_COMPILER} STREQUAL "FUJITSU") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "IBM") +if (${CMAKE_Fortran_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") # FCOMMON_OPT += -qarch=440 if (BINARY64) @@ -91,7 +91,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "IBM") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "PGI") +if (${CMAKE_Fortran_COMPILER} STREQUAL "PGI") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) @@ -107,7 +107,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "PGI") endif () endif () 
-if (${Fortran_COMPILER_NAME} STREQUAL "PATHSCALE") +if (${CMAKE_Fortran_COMPILER} STREQUAL "PATHSCALE") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") if (BINARY64) if (INTERFACE64) @@ -134,7 +134,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "PATHSCALE") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "OPEN64") +if (${CMAKE_Fortran_COMPILER} STREQUAL "OPEN64") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") if (BINARY64) @@ -172,7 +172,7 @@ if (${Fortran_COMPILER_NAME} STREQUAL "OPEN64") endif () endif () -if (${Fortran_COMPILER_NAME} "SUN") +if (${CMAKE_Fortran_COMPILER} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") if (${ARCH} STREQUAL "x86") set(FCOMMON_OPT "${FCOMMON_OPT} -m32") @@ -184,7 +184,7 @@ if (${Fortran_COMPILER_NAME} "SUN") endif () endif () -if (${Fortran_COMPILER_NAME} STREQUAL "COMPAQ") +if (${CMAKE_Fortran_COMPILER} STREQUAL "COMPAQ") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 8e05647a3..6312a515e 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -6,19 +6,10 @@ ## Next it runs f_check and appends some fortran information to the files. ## Finally it runs getarch and getarch_2nd for even more environment information. 
-# List of vars set by this file and included files: -# OSNAME -# ARCH -# C_COMPILER -# BINARY32 -# BINARY64 -# CEXTRALIB -# F_COMPILER -# FC -# BU -# CORE <- REQUIRED +# CMake vars set by this file: +# CORE # LIBCORE -# NUM_CORES <- REQUIRED +# NUM_CORES # HAVE_MMX # HAVE_SSE # HAVE_SSE2 diff --git a/cmake/system.cmake b/cmake/system.cmake index ad4a6f3be..f85d13f03 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -150,7 +150,7 @@ if (NEED_PIC) set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC") endif () - if (${Fortran_COMPILER_NAME} STREQUAL "SUN") + if (${CMAKE_Fortran_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") @@ -186,7 +186,7 @@ endif () if (SMP) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMP_SERVER") - if (${ARCH} STERQUAL "mips64") + if (${ARCH} STREQUAL "mips64") if (NOT ${CORE} STREQUAL "LOONGSON3B") set(USE_SIMPLE_THREADED_LEVEL3 1) endif () @@ -299,7 +299,7 @@ if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () -if (${OSNAME} STREQUAL "SunOS") +if (${CMAKE_SYSTEM_NAME} STREQUAL "SunOS") set(TAR gtar) set(PATCH gpatch) set(GREP ggrep) @@ -315,8 +315,8 @@ endif () set(AWK awk) -set(REVISION "-r${VERSION}") -string(REGEX MATCH "[0-9]+\\." MAJOR_VERSION "${VERSION}") +set(REVISION "-r${OpenBLAS_VERSION}") +set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) if (DEBUG) set(COMMON_OPT "${COMMON_OPT} -g") From 0ccfa60a537b5057d05abcc2b9e57600f0528515 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 15:09:37 -0600 Subject: [PATCH 037/257] Changed fortran compiler name to be uppercase and stripped of path/ext. 
--- cmake/arch.cmake | 4 ++-- cmake/f_check.cmake | 4 ++++ cmake/fc.cmake | 22 +++++++++++----------- cmake/system.cmake | 6 ++---- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5848c2409..34beb71b3 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -12,7 +12,7 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") endif () if (NOT NO_EXPRECISION) - if (${CMAKE_Fortran_COMPILER} MATCHES "gfortran.*") + if (${F_COMPILER} MATCHES "GFORTRAN") # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") set(EXPRECISION 1) @@ -86,7 +86,7 @@ if (${ARCH} STREQUAL "ia64") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) - if (${CMAKE_Fortran_COMPILER} MATCHES "gfortran.*") + if (${F_COMPILER} MATCHES "GFORTRAN") if (${CMAKE_C_COMPILER} STREQUAL "GNU") # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 6c1dbdf18..53552083b 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -48,3 +48,7 @@ else () "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n") endif() + +get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) +string(TOUPPER ${F_COMPILER} F_COMPILER) + diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 61f0e0187..d7215866c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,7 +3,7 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. 
-if (${CMAKE_Fortran_COMPILER} STREQUAL "G77") +if (${F_COMPILER} STREQUAL "G77") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) @@ -15,7 +15,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "G77") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "G95") +if (${F_COMPILER} STREQUAL "G95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) @@ -27,7 +27,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "G95") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "GFORTRAN") +if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc @@ -58,7 +58,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "GFORTRAN") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "INTEL") +if (${F_COMPILER} STREQUAL "INTEL") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") @@ -68,14 +68,14 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "INTEL") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "FUJITSU") +if (${F_COMPILER} STREQUAL "FUJITSU") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "IBM") +if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") # FCOMMON_OPT += -qarch=440 if (BINARY64) @@ -91,7 +91,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "IBM") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "PGI") +if (${F_COMPILER} STREQUAL "PGI") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) @@ -107,7 +107,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "PGI") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "PATHSCALE") +if (${F_COMPILER} 
STREQUAL "PATHSCALE") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") if (BINARY64) if (INTERFACE64) @@ -134,7 +134,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "PATHSCALE") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "OPEN64") +if (${F_COMPILER} STREQUAL "OPEN64") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") if (BINARY64) @@ -172,7 +172,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "OPEN64") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "SUN") +if (${F_COMPILER} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") if (${ARCH} STREQUAL "x86") set(FCOMMON_OPT "${FCOMMON_OPT} -m32") @@ -184,7 +184,7 @@ if (${CMAKE_Fortran_COMPILER} STREQUAL "SUN") endif () endif () -if (${CMAKE_Fortran_COMPILER} STREQUAL "COMPAQ") +if (${F_COMPILER} STREQUAL "COMPAQ") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") diff --git a/cmake/system.cmake b/cmake/system.cmake index f85d13f03..1f602e021 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -72,7 +72,7 @@ if (CMAKE_CXX_COMPILER STREQUAL loongcc) endif () #if don't use Fortran, it will only compile CBLAS. -if (${ONLY_CBLAS}) +if (ONLY_CBLAS) set(NO_LAPACK 1) else () set(ONLY_CBLAS 0) @@ -81,12 +81,10 @@ endif () include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") if (NOT DEFINED NUM_THREADS) - # TODO: NUM_CORES comes from `getarch.c` or `cpuid_x86.c`. This is built and executed above in `Makefile.prebuild`, and the results are in `Makefile.conf` and `Makefile_kernel.conf`. -hpa set(NUM_THREADS ${NUM_CORES}) endif () if (${NUM_THREADS} EQUAL 1) - # TODO: was "override USE_THREAD = 0", do we need "override" here? 
-hpa set(USE_THREAD 0) endif () @@ -150,7 +148,7 @@ if (NEED_PIC) set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC") endif () - if (${CMAKE_Fortran_COMPILER} STREQUAL "SUN") + if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") From 560c96a9a7e96461bd29cf2b984dc26655d98fb7 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 15:11:15 -0600 Subject: [PATCH 038/257] Fixed newlines in some cmake files. --- cmake/arch.cmake | 230 ++++++++++++++-------------- cmake/cc.cmake | 206 ++++++++++++------------- cmake/fc.cmake | 386 +++++++++++++++++++++++------------------------ cmake/os.cmake | 204 ++++++++++++------------- 4 files changed, 513 insertions(+), 513 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 34beb71b3..d6fa3ed5d 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -1,115 +1,115 @@ -## -## Author: Hank Anderson -## Description: Ported from portion of OpenBLAS/Makefile.system -## Sets various variables based on architecture. - -if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") - - if (${ARCH} STREQUAL "x86") - if (NOT BINARY) - set(NO_BINARY_MODE 1) - endif () - endif () - - if (NOT NO_EXPRECISION) - if (${F_COMPILER} MATCHES "GFORTRAN") - # N.B. 
I'm not sure if CMake differentiates between GCC and LSB -hpa - if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") - set(EXPRECISION 1) - set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") - set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") - endif () - if (${CMAKE_C_COMPILER} STREQUAL "Clang") - set(EXPRECISION 1) - set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") - set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") - endif () - endif () - endif () -endif () - -if (${CMAKE_C_COMPILER} STREQUAL "Intel") - set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") -endif () - -if (USE_OPENMP) - - if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") - set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") - endif () - - if (${CMAKE_C_COMPILER} STREQUAL "Clang") - message(WARNING "Clang doesn't support OpenMP yet.") - set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") - endif () - - if (${CMAKE_C_COMPILER} STREQUAL "Intel") - set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") - endif () - - if (${CMAKE_C_COMPILER} STREQUAL "PGI") - set(CCOMMON_OPT "${CCOMMON_OPT} -mp") - endif () - - if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") - set(CCOMMON_OPT "${CCOMMON_OPT} -mp") - set(CEXTRALIB "${CEXTRALIB} -lstdc++") - endif () - - if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") - set(CCOMMON_OPT "${CCOMMON_OPT} -mp") - endif () -endif () - - -if (DYNAMIC_ARCH) - if (${ARCH} STREQUAL "x86") - set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") - endif () - - if (${ARCH} STREQUAL "x86_64") - set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") - if (NOT NO_AVX) - set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") - endif () - if (NOT NO_AVX2) - set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL") - endif () - endif () - - if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) - endif () 
-endif () - -if (${ARCH} STREQUAL "ia64") - set(NO_BINARY_MODE 1) - set(BINARY_DEFINED 1) - - if (${F_COMPILER} MATCHES "GFORTRAN") - if (${CMAKE_C_COMPILER} STREQUAL "GNU") - # EXPRECISION = 1 - # CCOMMON_OPT += -DEXPRECISION - endif () - endif () -endif () - -if (${ARCH} STREQUAL "mips64") - set(NO_BINARY_MODE 1) -endif () - -if (${ARCH} STREQUAL "alpha") - set(NO_BINARY_MODE 1) - set(BINARY_DEFINED 1) -endif () - -if (${ARCH} STREQUAL "arm") - set(NO_BINARY_MODE 1) - set(BINARY_DEFINED 1) -endif () - -if (${ARCH} STREQUAL "arm64") - set(NO_BINARY_MODE 1) - set(BINARY_DEFINED 1) -endif () - +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets various variables based on architecture. + +if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") + + if (${ARCH} STREQUAL "x86") + if (NOT BINARY) + set(NO_BINARY_MODE 1) + endif () + endif () + + if (NOT NO_EXPRECISION) + if (${F_COMPILER} MATCHES "GFORTRAN") + # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") +endif () + +if (USE_OPENMP) + + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + message(WARNING "Clang doesn't support OpenMP yet.") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") + 
endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + set(CEXTRALIB "${CEXTRALIB} -lstdc++") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () +endif () + + +if (DYNAMIC_ARCH) + if (${ARCH} STREQUAL "x86") + set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + endif () + + if (${ARCH} STREQUAL "x86_64") + set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + if (NOT NO_AVX) + set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") + endif () + if (NOT NO_AVX2) + set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL") + endif () + endif () + + if (NOT DYNAMIC_CORE) + unset(DYNAMIC_ARCH) + endif () +endif () + +if (${ARCH} STREQUAL "ia64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) + + if (${F_COMPILER} MATCHES "GFORTRAN") + if (${CMAKE_C_COMPILER} STREQUAL "GNU") + # EXPRECISION = 1 + # CCOMMON_OPT += -DEXPRECISION + endif () + endif () +endif () + +if (${ARCH} STREQUAL "mips64") + set(NO_BINARY_MODE 1) +endif () + +if (${ARCH} STREQUAL "alpha") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 28daa72dc..de196524f 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -1,103 +1,103 @@ -## -## Author: Hank Anderson -## Description: Ported from portion of OpenBLAS/Makefile.system -## Sets C related variables. 
- -if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") - - set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") - set(COMMON_PROF "${COMMON_PROF} -fno-inline") - set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") - - if (QUIET_MAKE) - set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") - endif () - - if (NO_BINARY_MODE) - - if (${ARCH} STREQUAL "mips64") - if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") - else () - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32") - endif () - set(BINARY_DEFINED 1) - endif () - - if (${CORE} STREQUAL "LOONGSON3A") - set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") - set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") - endif () - - if (${CORE} STREQUAL "LOONGSON3B") - set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") - set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") - endif () - - if (${OSNAME} STREQUAL "AIX") - set(BINARY_DEFINED 1) - endif () - endif () - - if (NOT BINARY_DEFINED) - if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -m64") - else () - set(CCOMMON_OPT "${CCOMMON_OPT} -m32") - endif () - endif () -endif () - -if (${CMAKE_C_COMPILER} STREQUAL "PGI") - if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") - else () - set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7") - endif () -endif () - -if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") - if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -m64") - else () - set(CCOMMON_OPT "${CCOMMON_OPT} -m32") - endif () -endif () - -if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") - - if (${ARCH} STREQUAL "mips64") - - if (NOT BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -n32") - else () - set(CCOMMON_OPT "${CCOMMON_OPT} -n64") - endif () - - if (${CORE} STREQUAL "LOONGSON3A") - set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") - endif () - - if (${CORE} STREQUAL "LOONGSON3B") - set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") - endif () - - else () - - if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -m32") - else () - 
set(CCOMMON_OPT "${CCOMMON_OPT} -m64") - endif () - endif () -endif () - -if (${CMAKE_C_COMPILER} STREQUAL "SUN") - set(CCOMMON_OPT "${CCOMMON_OPT} -w") - if (${ARCH} STREQUAL "x86") - set(CCOMMON_OPT "${CCOMMON_OPT} -m32") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - endif () -endif () - +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets C related variables. + +if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") + + set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") + set(COMMON_PROF "${COMMON_PROF} -fno-inline") + set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") + + if (QUIET_MAKE) + set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") + endif () + + if (NO_BINARY_MODE) + + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32") + endif () + set(BINARY_DEFINED 1) + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${OSNAME} STREQUAL "AIX") + set(BINARY_DEFINED 1) + endif () + endif () + + if (NOT BINARY_DEFINED) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PGI") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7") + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + + if (${ARCH} STREQUAL "mips64") + 
+ if (NOT BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -n32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -n64") + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + else () + + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -w") + if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () +endif () + diff --git a/cmake/fc.cmake b/cmake/fc.cmake index d7215866c..a47865b63 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -1,193 +1,193 @@ -## -## Author: Hank Anderson -## Description: Ported from portion of OpenBLAS/Makefile.system -## Sets Fortran related variables. - -if (${F_COMPILER} STREQUAL "G77") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") - if (NOT NO_BINARY_MODE) - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - endif () - endif () -endif () - -if (${F_COMPILER} STREQUAL "G95") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") - if (NOT NO_BINARY_MODE) - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - endif () - endif () -endif () - -if (${F_COMPILER} STREQUAL "GFORTRAN") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") - #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc - if (NOT NO_LAPACK) - set(EXTRALIB "{EXTRALIB} -lgfortran") - endif () - if (NO_BINARY_MODE) - if (${ARCH} STREQUAL "mips64") - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") - else () - set(FCOMMON_OPT 
"${FCOMMON_OPT} -mabi=n32") - endif () - endif () - else () - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") - endif () - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - endif () - endif () - - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "INTEL") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "FUJITSU") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "IBM") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -q64") - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8") - endif () - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -q32") - endif () - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "PGI") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") - set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") - if (BINARY64) - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () - set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") - endif () - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "PATHSCALE") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") - if (BINARY64) - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () - endif () - - if (NOT ${ARCH} STREQUAL "mips64") - if (NOT BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - endif () - else () - if (BINARY64) - 
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") - endif () - endif () - - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "OPEN64") - - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") - if (BINARY64) - if (INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () - endif () - - if (${ARCH} STREQUAL "mips64") - - if (NOT BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -n32") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -n64") - endif () - - if (${CORE} STREQUAL "LOONGSON3A") - set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") - endif () - - if (${CORE} STREQUAL "LOONGSON3B") - set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") - endif () - else () - if (NOT BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - endif () - endif () - - if (USE_OPENMP) - set(FEXTRALIB "${FEXTRALIB} -lstdc++") - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") - endif () -endif () - -if (${F_COMPILER} STREQUAL "SUN") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") - if (${ARCH} STREQUAL "x86") - set(FCOMMON_OPT "${FCOMMON_OPT} -m32") - else () - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") - endif () - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") - endif () -endif () - -if (${F_COMPILER} STREQUAL "COMPAQ") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") - if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") - endif () -endif () - +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets Fortran related variables. 
+ +if (${F_COMPILER} STREQUAL "G77") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${F_COMPILER} STREQUAL "G95") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${F_COMPILER} STREQUAL "GFORTRAN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + if (NOT NO_LAPACK) + set(EXTRALIB "${EXTRALIB} -lgfortran") + endif () + if (NO_BINARY_MODE) + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "INTEL") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "FUJITSU") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "IBM") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") + # FCOMMON_OPT += -qarch=440 + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -q64") + if (INTERFACE64) + 
set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -q32") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") + set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (NOT ${ARCH} STREQUAL "mips64") + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "OPEN64") + + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (${ARCH} STREQUAL "mips64") + + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -n32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -n64") + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + else () + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + endif () + + if (USE_OPENMP) + set(FEXTRALIB "${FEXTRALIB} -lstdc++") + set(FCOMMON_OPT 
"${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") + if (${ARCH} STREQUAL "x86") + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") + endif () +endif () + +if (${F_COMPILER} STREQUAL "COMPAQ") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + diff --git a/cmake/os.cmake b/cmake/os.cmake index c333cdbe0..cf36ef62f 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -1,102 +1,102 @@ -## -## Author: Hank Anderson -## Description: Ported from portion of OpenBLAS/Makefile.system -## Detects the OS and sets appropriate variables. - -if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var - set(MD5SUM "md5 -r") -endif () - -if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") - set(MD5SUM "md5 -r") -endif () - -if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD") - set(MD5SUM "md5 -n") -endif () - -if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - set(EXTRALIB "${EXTRALIB} -lm") - set(NO_EXPRECISION 1) -endif () - -if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") - set(EXTRALIB "${EXTRALIB} -lm") -endif () - -# TODO: this is probably meant for mingw, not other windows compilers -if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - - set(NEED_PIC 0) - set(NO_EXPRECISION 1) - - set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32") - - # probably not going to use these - set(SUFFIX "obj") - set(PSUFFIX "pobj") - set(LIBSUFFIX "a") - - if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") - endif () - - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - - # Test for supporting MS_ABI - # removed string parsing in favor of CMake's version comparison -hpa - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion 
OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - # GCC Version >=4.7 - # It is compatible with MSVC ABI. - set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") - endif () - endif () - - # Ensure the correct stack alignment on Win32 - # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 - if (${ARCH} STREQUAL "x86") - set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") - set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") - endif () - -endif () - -if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix") - set(NEED_PIC 0) - set(NO_EXPRECISION 1) - - set(INTERIX_TOOL_DIR STREQUAL "/opt/gcc.3.3/i586-pc-interix3/bin") -endif () - -if (CYGWIN) - set(NEED_PIC 0) - set(NO_EXPRECISION 1) -endif () - -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") - if (SMP) - set(EXTRALIB "${EXTRALIB} -lpthread") - endif () -endif () - -if (QUAD_PRECISION) - set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION") - set(NO_EXPRECISION 1) -endif () - -if (${ARCH} STREQUAL "x86") - set(NO_EXPRECISION 1) -endif () - -if (UTEST_CHECK) - set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") - set(SANITY_CHECK 1) -endif () - -if (SANITY_CHECK) - # TODO: need some way to get $(*F) (target filename) - set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") -endif () - +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Detects the OS and sets appropriate variables. 
+ +if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD") + set(MD5SUM "md5 -n") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") + set(EXTRALIB "${EXTRALIB} -lm") +endif () + +# TODO: this is probably meant for mingw, not other windows compilers +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32") + + # probably not going to use these + set(SUFFIX "obj") + set(PSUFFIX "pobj") + set(LIBSUFFIX "a") + + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + + # Test for supporting MS_ABI + # removed string parsing in favor of CMake's version comparison -hpa + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + # GCC Version >=4.7 + # It is compatible with MSVC ABI. 
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + endif () + + # Ensure the correct stack alignment on Win32 + # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 + if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") + set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") + endif () + +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(INTERIX_TOOL_DIR "/opt/gcc.3.3/i586-pc-interix3/bin") +endif () + +if (CYGWIN) + set(NEED_PIC 0) + set(NO_EXPRECISION 1) +endif () + +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + if (SMP) + set(EXTRALIB "${EXTRALIB} -lpthread") + endif () +endif () + +if (QUAD_PRECISION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION") + set(NO_EXPRECISION 1) +endif () + +if (${ARCH} STREQUAL "x86") + set(NO_EXPRECISION 1) +endif () + +if (UTEST_CHECK) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") + set(SANITY_CHECK 1) +endif () + +if (SANITY_CHECK) + # TODO: need some way to get $(*F) (target filename) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") +endif () + From 0beea3a5a5dc80df16aad88a5a8a37db349cb26c Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 15:33:56 -0600 Subject: [PATCH 039/257] Converted LAPACK flags from Makefile.system. 
--- cmake/system.cmake | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1f602e021..8acbb64c9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -324,4 +324,38 @@ if (NOT DEFINED COMMON_OPT) set(COMMON_OPT "-O2") endif () +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +# TODO: not sure what PFLAGS is -hpa +set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") + +set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${COMMON_OPT} ${FCOMMON_OPT}") +# TODO: not sure what FPFLAGS is -hpa +set(FPFLAGS "${FPFLAGS} ${COMMON_OPT} ${FCOMMON_OPT} ${COMMON_PROF}") + +#For LAPACK Fortran codes. +set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") +set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") + +#Disable -fopenmp for LAPACK Fortran codes on Windows. +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + message(STATUS "FFLAGS: ${LAPACK_FFLAGS}") + set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () +endif () + +set(LAPACK_CFLAGS "${CMAKE_C_FLAGS} -DHAVE_LAPACK_CONFIG_H") +if (INTERFACE64) + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_ILP64") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DOPENBLAS_OS_WINDOWS") +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "LSB") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") +endif () From 84b3d760c4f9d63b4cbb9f48bd85469a5527a6a2 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 3 Feb 2015 16:05:01 -0600 Subject: [PATCH 040/257] Converted rest of Makefile.system to system.cmake. 
--- cmake/system.cmake | 164 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 8acbb64c9..5ad80d618 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -338,7 +338,6 @@ set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") #Disable -fopenmp for LAPACK Fortran codes on Windows. if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - message(STATUS "FFLAGS: ${LAPACK_FFLAGS}") set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") foreach (FILTER_FLAG ${FILTER_FLAGS}) string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) @@ -359,3 +358,166 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if (NOT DEFINED SUFFIX) + set(SUFFIX o) +endif () + +if (NOT DEFINED PSUFFIX) + set(PSUFFIX po) +endif () + +if (NOT DEFINED LIBSUFFIX) + set(LIBSUFFIX a) +endif () + +if (DYNAMIC_ARCH) + if (DEFINED SMP) + set(LIBNAME "${LIBPREFIX}p${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}p${REVISION}_p.${LIBSUFFIX}") + else () + set(LIBNAME "${LIBPREFIX}${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}${REVISION}_p.${LIBSUFFIX}") + endif () +else () + if (DEFINED SMP) + set(LIBNAME "${LIBPREFIX}_${LIBCORE}p${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}p${REVISION}_p.${LIBSUFFIX}") + else () + set(LIBNAME "${LIBPREFIX}_${LIBCORE}${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}${REVISION}_p.${LIBSUFFIX}") + endif () +endif () + + +set(LIBDLLNAME "${LIBPREFIX}.dll") +set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") +set(LIBDYNNAME "${LIBNAME}.${LIBSUFFIX}.dylib") +set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") +set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") +set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") + +set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") +set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") + + +set(LIB_COMPONENTS BLAS) +if (NOT NO_CBLAS) + set(LIB_COMPONENTS 
"${LIB_COMPONENTS} CBLAS") +endif () + +if (NOT NO_LAPACK) + set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACK") + if (NOT NO_LAPACKE) + set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACKE") + endif () +endif () + +if (ONLY_CBLAS) + set(LIB_COMPONENTS CBLAS) +endif () + +#export OSNAME +#export ARCH +#export CORE +#export LIBCORE +#export PGCPATH +#export CONFIG +#export CC +#export FC +#export BU +#export FU +#export NEED2UNDERSCORES +#export USE_THREAD +#export NUM_THREADS +#export NUM_CORES +#export SMP +#export MAKEFILE_RULE +#export NEED_PIC +#export BINARY +#export BINARY32 +#export BINARY64 +#export F_COMPILER +#export C_COMPILER +#export USE_OPENMP +#export CROSS +#export CROSS_SUFFIX +#export NOFORTRAN +#export NO_FBLAS +#export EXTRALIB +#export CEXTRALIB +#export FEXTRALIB +#export HAVE_SSE +#export HAVE_SSE2 +#export HAVE_SSE3 +#export HAVE_SSSE3 +#export HAVE_SSE4_1 +#export HAVE_SSE4_2 +#export HAVE_SSE4A +#export HAVE_SSE5 +#export HAVE_AVX +#export HAVE_VFP +#export HAVE_VFPV3 +#export HAVE_VFPV4 +#export HAVE_NEON +#export KERNELDIR +#export FUNCTION_PROFILE +#export TARGET_CORE +# +#export SGEMM_UNROLL_M +#export SGEMM_UNROLL_N +#export DGEMM_UNROLL_M +#export DGEMM_UNROLL_N +#export QGEMM_UNROLL_M +#export QGEMM_UNROLL_N +#export CGEMM_UNROLL_M +#export CGEMM_UNROLL_N +#export ZGEMM_UNROLL_M +#export ZGEMM_UNROLL_N +#export XGEMM_UNROLL_M +#export XGEMM_UNROLL_N +#export CGEMM3M_UNROLL_M +#export CGEMM3M_UNROLL_N +#export ZGEMM3M_UNROLL_M +#export ZGEMM3M_UNROLL_N +#export XGEMM3M_UNROLL_M +#export XGEMM3M_UNROLL_N + + +#if (USE_CUDA) +# export CUDADIR +# export CUCC +# export CUFLAGS +# export CULIB +#endif + +#.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f +# +#.f.$(SUFFIX): +# $(FC) $(FFLAGS) -c $< -o $(@F) +# +#.f.$(PSUFFIX): +# $(FC) $(FPFLAGS) -pg -c $< -o $(@F) + +# these are not cross-platform +#ifdef BINARY64 +#PATHSCALEPATH = /opt/pathscale/lib/3.1 +#PGIPATH = /opt/pgi/linux86-64/7.1-5/lib +#else +#PATHSCALEPATH = /opt/pathscale/lib/3.1/32 +#PGIPATH = 
/opt/pgi/linux86/7.1-5/lib +#endif + +#ACMLPATH = /opt/acml/4.3.0 +#ifneq ($(OSNAME), Darwin) +#MKLPATH = /opt/intel/mkl/10.2.2.025/lib +#else +#MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib +#endif +#ATLASPATH = /opt/atlas/3.9.17/opteron +#FLAMEPATH = $(HOME)/flame/lib +#ifneq ($(OSNAME), SunOS) +#SUNPATH = /opt/sunstudio12.1 +#else +#SUNPATH = /opt/SUNWspro +#endif + From a0aeda6187f38d8447edeb811546519aa8b04c2f Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 4 Feb 2015 10:37:34 -0600 Subject: [PATCH 041/257] Added function to set defines for the object names (e.g. -DNAME=dgemm). --- CMakeLists.txt | 2 +- cmake/c_check.cmake | 2 +- cmake/f_check.cmake | 3 +-- cmake/system.cmake | 2 +- cmake/utils.cmake | 32 ++++++++++++++++++++++++++++++++ interface/CMakeLists.txt | 9 +++++---- 6 files changed, 41 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0869e6fad..aae9c60fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # is this necessary? lapack-netlib has its own fortran checks in its CMakeLists.txt -#enable_language(Fortran) +enable_language(Fortran) message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with.") diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 961bb00c4..2e5ce5edc 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -26,7 +26,7 @@ # N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. 
-# TODO: detect NEED_FU +# TODO: detect NEED_FU/FU set(NEED_FU 1) # Convert CMake vars into the format that OpenBLAS expects diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 53552083b..266cdbb2a 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -27,8 +27,7 @@ if (NOT ONLY_CBLAS) # execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER} # WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) - # TODO: is BU makefile macro needed? - # TODO: detect whether underscore needed, set #defines appropriately - use try_compile + # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? set(BU "_") diff --git a/cmake/system.cmake b/cmake/system.cmake index 5ad80d618..c17b7502e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -251,7 +251,7 @@ set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") # include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake # TODO: Need to figure out how to get $(*F) in cmake -set(CCOMMON_OPT "${CCOMMON_OPT} -DASMNAME=${FU}$(*F) -DASMFNAME=${FU}$(*F)${BU} -DNAME=$(*F)${BU} -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)${BU}\" -DCHAR_CNAME=\"$(*F)\"") +#set(CCOMMON_OPT "${CCOMMON_OPT} -DASMNAME=${FU}$(*F) -DASMFNAME=${FU}$(*F)${BU} -DNAME=$(*F)${BU} -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)${BU}\" -DCHAR_CNAME=\"$(*F)\"") if (${CORE} STREQUAL "PPC440") set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 944e24cc4..6cee74974 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -88,3 +88,35 @@ function(GenerateObjects sources_in defines_in all_defines_in) set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) endfunction () +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition +# @param sources_in the source files to build from +# @param float_type_in the float type to define for this build (e.g. 
SINGLE/DOUBLE/etc) +# @param defines_in (optional) preprocessor definitions that will be applied to all objects +function(GenerateNamedObjects sources_in float_type_in defines_in) + set(OBJ_LIST_OUT "") + foreach (source_file ${sources_in}) + + get_filename_component(source_name ${source_file} NAME_WE) + + string(SUBSTRING ${float_type_in} 0 1 float_char) + string(TOLOWER ${float_char} float_char) + + # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) + set(obj_name "${float_char}${source_name}") + + # parse file name + string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) + string(TOUPPER ${source_name} source_name) + + # now add the object and set the defines + add_library(${obj_name} OBJECT ${source_file}) + set(obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") + list(APPEND obj_defines ${defines_in}) + list(APPEND obj_defines ${float_type_in}) + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") + + list(APPEND OBJ_LIST_OUT ${obj_name}) + + endforeach () + set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) +endfunction () diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 6ef498cb5..6082c55e3 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -39,15 +39,16 @@ if (NOT DEFINED NO_FBLAS) set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") add_library(MAX_OBJ OBJECT max.c) - add_library(DBLAS1OBJS OBJECT ${BLAS1_SOURCES}) - add_library(DBLAS2OBJS OBJECT ${BLAS2_SOURCES}) - add_library(DBLAS3OBJS OBJECT ${BLAS3_SOURCES}) + GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "") + GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "") + GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # trmm is trsm with a compiler flag set add_library(TRMM_OBJ 
OBJECT trsm.c) set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") - list(APPEND DBLAS_OBJS "DBLAS1OBJS;DBLAS2OBJS;DBLAS3OBJS;AMAX_OBJ;AMIN_OBJ;MIN_OBJ;MAX_OBJ;TRMM_OBJ") + list(APPEND DBLAS_OBJS "AMAX_OBJ;AMIN_OBJ;MIN_OBJ;MAX_OBJ;TRMM_OBJ") endif () if (NOT DEFINED NO_CBLAS) From 5690cf3f0e26256dbe228e4dd6a419fdbf41e43e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 4 Feb 2015 10:52:19 -0600 Subject: [PATCH 042/257] Added override for function names in GenerateNamedObjects. The BLAS interface folder should now be generated the correct objects for the DOUBLE case. --- cmake/utils.cmake | 23 +++++++++++++--------- interface/CMakeLists.txt | 41 +++++++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6cee74974..d02ee1a41 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -92,21 +92,26 @@ endfunction () # @param sources_in the source files to build from # @param float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) # @param defines_in (optional) preprocessor definitions that will be applied to all objects -function(GenerateNamedObjects sources_in float_type_in defines_in) +# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. +# e.g. 
with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" +function(GenerateNamedObjects sources_in float_type_in defines_in name_in) set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) - get_filename_component(source_name ${source_file} NAME_WE) - string(SUBSTRING ${float_type_in} 0 1 float_char) string(TOLOWER ${float_char} float_char) - # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) - set(obj_name "${float_char}${source_name}") - - # parse file name - string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) - string(TOUPPER ${source_name} source_name) + if (NOT name_in) + get_filename_component(source_name ${source_file} NAME_WE) + set(obj_name "${float_char}${source_name}") + else () + # replace * with float_char + if (${name_in} MATCHES "\\*") + string(REPLACE "*" ${float_char} obj_name ${name_in}) + else () + set(obj_name "${float_char}${name_in}") + endif () + endif () # now add the object and set the defines add_library(${obj_name} OBJECT ${source_file}) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 6082c55e3..e2f073d37 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -30,25 +30,36 @@ set(BLAS3_SOURCES if (NOT DEFINED NO_FBLAS) - # N.B. The original Makefile passed in -UUSE_MIN and -UUSE_ABS (where appropriate), no way to do that at a source-level in cmake. REMOVE_DEFINITIONS removes a definition for the rest of the compilation. 
- add_library(AMAX_OBJ OBJECT max.c) - set_target_properties(AMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS") - add_library(AMIN_OBJ OBJECT max.c) - set_target_properties(AMIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_ABS;USE_MIN") - add_library(MIN_OBJ OBJECT max.c) - set_target_properties(MIN_OBJ PROPERTIES COMPILE_DEFINITIONS "USE_MIN") - add_library(MAX_OBJ OBJECT max.c) - - GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "") - GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "") - GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "") + GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "" "") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # trmm is trsm with a compiler flag set - add_library(TRMM_OBJ OBJECT trsm.c) - set_target_properties(TRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "TRMM") + GenerateNamedObjects("trsm.c" "DOUBLE" "TRMM" "trmm") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + # max and imax are compiled 4 times + GenerateNamedObjects("max.c" "DOUBLE" "" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS" "amax") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS;USE_MIN" "amin") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("max.c" "DOUBLE" "USE_MIN" "min") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + GenerateNamedObjects("imax.c" "DOUBLE" "" "i*max") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS" "i*amax") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS;USE_MIN" "i*amin") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("imax.c" "DOUBLE" "USE_MIN" "i*min") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - list(APPEND DBLAS_OBJS 
"AMAX_OBJ;AMIN_OBJ;MIN_OBJ;MAX_OBJ;TRMM_OBJ") endif () if (NOT DEFINED NO_CBLAS) From 58cff2fed8d5ba02862f870745395684dac0b4f1 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 4 Feb 2015 11:30:15 -0600 Subject: [PATCH 043/257] Added CBLAS define/naming convention to GenerateNamedObjects. --- cmake/utils.cmake | 14 +++++++-- interface/CMakeLists.txt | 61 +++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index d02ee1a41..f839245b1 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -94,7 +94,7 @@ endfunction () # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" -function(GenerateNamedObjects sources_in float_type_in defines_in name_in) +function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas) set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) @@ -114,10 +114,18 @@ function(GenerateNamedObjects sources_in float_type_in defines_in name_in) endif () # now add the object and set the defines - add_library(${obj_name} OBJECT ${source_file}) - set(obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") + set(obj_defines ${defines_in}) + + if (use_cblas) + set(obj_name "cblas_${obj_name}") + list(APPEND obj_defines "CBLAS") + endif () + + list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") list(APPEND obj_defines ${defines_in}) list(APPEND obj_defines ${float_type_in}) + + 
add_library(${obj_name} OBJECT ${source_file}) set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") list(APPEND OBJ_LIST_OUT ${obj_name}) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index e2f073d37..90fca8b8f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -27,68 +27,59 @@ set(BLAS3_SOURCES omatcopy.c imatcopy.c ) +# generate the BLAS objs once with and once without cblas +set (CBLAS_FLAGS "") if (NOT DEFINED NO_FBLAS) + list(APPEND CBLAS_FLAGS 0) +endif () + +if (NOT DEFINED NO_CBLAS) + list(APPEND CBLAS_FLAGS 1) +endif () - GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "" "") +foreach (CBLAS_FLAG ${CBLAS_FLAGS}) + + GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "" "") + GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "" "") + GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # trmm is trsm with a compiler flag set - GenerateNamedObjects("trsm.c" "DOUBLE" "TRMM" "trmm") + GenerateNamedObjects("trsm.c" "DOUBLE" "TRMM" "trmm" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # max and imax are compiled 4 times - GenerateNamedObjects("max.c" "DOUBLE" "" "") + GenerateNamedObjects("max.c" "DOUBLE" "" "" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS" "amax") + GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS" "amax" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS;USE_MIN" "amin") + GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_MIN" "min") + GenerateNamedObjects("max.c" "DOUBLE" 
"USE_MIN" "min" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("imax.c" "DOUBLE" "" "i*max") + GenerateNamedObjects("imax.c" "DOUBLE" "" "i*max" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS" "i*amax") + GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS" "i*amax" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS;USE_MIN" "i*amin") + GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_MIN" "i*min") + GenerateNamedObjects("imax.c" "DOUBLE" "USE_MIN" "i*min" ${CBLAS_FLAG}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -endif () - -if (NOT DEFINED NO_CBLAS) - - add_library(ISAMAX_OBJ OBJECT imax.c) - set_target_properties(ISAMAX_OBJ PROPERTIES COMPILE_DEFINITIONS "CBLAS;USE_ABS") - - add_library(CDBLAS1_OBJS OBJECT ${BLAS1_SOURCES}) - add_library(CDBLAS2_OBJS OBJECT ${BLAS2_SOURCES}) - add_library(CDBLAS3_OBJS OBJECT ${BLAS3_SOURCES}) - - # trmm is trsm with a compiler flag set - add_library(CTRMM_OBJ OBJECT trsm.c) - set_target_properties(CTRMM_OBJ PROPERTIES COMPILE_DEFINITIONS "CBLAS;TRMM") - - set_target_properties(CDBLAS1_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") - set_target_properties(CDBLAS2_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") - set_target_properties(CDBLAS3_OBJS PROPERTIES COMPILE_DEFINITIONS "CBLAS") - - list(APPEND DBLAS_OBJS "CDBLAS1_OBJS;CDBLAS2_OBJS;CDBLAS3_OBJS;ISAMAX_OBJ;CTRMM_OBJ") -endif () +endforeach () if (NOT DEFINED NO_LAPACK) - add_library(DLAPACK_OBJS OBJECT + set(LAPACK_SOURCES lapack/getrf.c lapack/getrs.c lapack/potrf.c lapack/getf2.c lapack/potf2.c lapack/laswp.c lapack/gesv.c lapack/lauu2.c lapack/lauum.c lapack/trti2.c lapack/trtri.c ) - list(APPEND DBLAS_OBJS "DLAPACK_OBJS") + GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 2828f6630c847904973106133fc0e897807a511e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 4 Feb 2015 14:01:36 -0600 Subject: [PATCH 044/257] Added SMP sources to COMMONOBJS. --- cmake/system.cmake | 3 --- driver/others/CMakeLists.txt | 51 ++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index c17b7502e..2a0678f83 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -250,9 +250,6 @@ set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") # TODO: nead to convert these Makefiles # include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake -# TODO: Need to figure out how to get $(*F) in cmake -#set(CCOMMON_OPT "${CCOMMON_OPT} -DASMNAME=${FU}$(*F) -DASMFNAME=${FU}$(*F)${BU} -DNAME=$(*F)${BU} -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)${BU}\" -DCHAR_CNAME=\"$(*F)\"") - if (${CORE} STREQUAL "PPC440") set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") endif () diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a28cf1e79..0d2a2f486 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -6,8 +6,32 @@ else () set(MEMORY memory.c) endif () +if (SMP) + + if (USE_OPENMP) + set(BLAS_SERVER blas_server_omp.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(BLAS_SERVER blas_server_win32.c) + endif () + + if (NOT DEFINED BLAS_SERVER) + set(BLAS_SERVER blas_server.c) + endif () + + set(SMP_SOURCES + ${BLAS_SERVER} + divtable.c # TODO: Makefile has -UDOUBLE + blas_l1_thread.c + ) + + if (NOT NO_AFFINITY) + list(APPEND SMP_SOURCES init.c) + endif () +endif () + add_library(COMMON_OBJS OBJECT ${MEMORY} + ${SMP_SOURCES} xerbla.c abs.c # TODO: this is split into c_abs (DOUBLE unset) and z_abs (DOUBLE set) in the Makefile openblas_set_num_threads.c @@ -16,13 +40,6 @@ add_library(COMMON_OBJS OBJECT openblas_error_handle.c ) -#ifdef SMP -#COMMONOBJS 
+= blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) -#ifndef NO_AFFINITY -#COMMONOBJS += init.$(SUFFIX) -#endif -#endif -# #ifeq ($(DYNAMIC_ARCH), 1) #COMMONOBJS += dynamic.$(SUFFIX) #else @@ -48,25 +65,7 @@ add_library(COMMON_OBJS OBJECT list(APPEND DBLAS_OBJS "COMMON_OBJS") #LIBOTHERS = libothers.$(LIBSUFFIX) -# -#ifeq ($(USE_OPENMP), 1) -#BLAS_SERVER = blas_server_omp.c -#else -#ifeq ($(OSNAME), WINNT) -#BLAS_SERVER = blas_server_win32.c -#endif -#ifeq ($(OSNAME), CYGWIN_NT) -#BLAS_SERVER = blas_server_win32.c -#endif -#ifeq ($(OSNAME), Interix) -#BLAS_SERVER = blas_server_win32.c -#endif -#endif -# -#ifndef BLAS_SERVER -#BLAS_SERVER = blas_server.c -#endif -# + #ifeq ($(DYNAMIC_ARCH), 1) #HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) #else From 373a1bdadb2737663a9f43e0644cf5cf3ec953cb Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 4 Feb 2015 15:47:10 -0600 Subject: [PATCH 045/257] Converted lapack/Makefile to cmake. --- CMakeLists.txt | 6 ++-- lapack/CMakeLists.txt | 83 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 lapack/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index aae9c60fd..eb15fa4bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ if (DEFINED SANITY_CHECK) endif () set(SUBDIRS ${BLASDIRS}) -if (NOT ${NO_LAPACK}) +if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () @@ -55,8 +55,8 @@ if (${NO_STATIC} AND ${NO_SHARED}) endif () set(DBLAS_OBJS "") -foreach (BLAS_DIR ${BLASDIRS}) - add_subdirectory(${BLAS_DIR}) +foreach (SUBDIR ${SUBDIRS}) + add_subdirectory(${SUBDIR}) endforeach () # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt new file mode 100644 index 000000000..c6d051c76 --- /dev/null +++ b/lapack/CMakeLists.txt @@ -0,0 +1,83 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + +# 
TODO: laswp needs arch specific code +# TODO: getrs needs to be compiled with and without TRANS (and up to TRANS=4 in the complex case) +# TODO: trti2 needs to be compiled with and without UNIT +# TODO: trtri needs to be compiled with and without UNIT + +set(LAPACK_SOURCES + getf2/getf2_k.c + getrf/getrf_single.c + getrs/getrs_single.c + potrf/potrf_U_single.c + potrf/potrf_L_single.c + potf2/potf2_U.c + potf2/potf2_L.c + lauu2/lauu2_U.c + lauu2/lauu2_L.c + lauum/lauum_U_single.c + lauum/lauum_L_single.c + trti2/trti2_U.c + trti2/trti2_L.c + trtri/trtri_U_single.c + trtri/trtri_L_single.c +) + +set(ZLAPACK_SOURCES + getf2/zgetf2_k.c + getrf/getrf_single.c + getrs/zgetrs_single.c + potrf/potrf_U_single.c + potrf/potrf_L_single.c + potf2/potf2_U.c + potf2/potf2_L.c + lauu2/zlauu2_U.c + lauu2/zlauu2_L.c + lauum/lauum_U_single.c + lauum/lauum_L_single.c + trti2/ztrti2_U.c + trti2/ztrti2_L.c + trtri/trtri_U_single.c + trtri/trtri_L_single.c +) + +GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +if (SMP) + + if (USE_OPENMP) + set(GETRF_SRC getrf/getrf_parallel_omp.c) + else () + set(GETRF_SRC getrf/getrf_parallel.c) + endif () + + set(PARALLEL_SOURCES + ${GETRF_SRC} + getrs/getrs_parallel.c + potrf/potrf_U_parallel.c + potrf/potrf_L_parallel.c + lauum/lauum_U_parallel.c + lauum/lauum_L_parallel.c + trtri/trtri_U_parallel.c + trtri/trtri_L_parallel.c + ) + + set(ZPARALLEL_SOURCES + ${GETRF_SRC} + getrs/zgetrs_parallel.c + potrf/potrf_U_parallel.c + potrf/potrf_L_parallel.c + lauum/lauum_U_parallel.c + lauum/lauum_L_parallel.c + trtri/trtri_U_parallel.c + trtri/trtri_L_parallel.c + ) + + GenerateNamedObjects("${PARALLEL_SOURCES}" "DOUBLE" "" "" 0) + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS + From 0d7bad1f35b8672f37fd178a5dc5d21868270860 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 09:02:54 -0600 
Subject: [PATCH 046/257] Changed GenerateObjects to append combination codes (e.g. dtrmm_TU). --- cmake/utils.cmake | 125 ++++++++++++++++++++++------------- driver/level2/CMakeLists.txt | 2 + driver/level3/CMakeLists.txt | 27 ++++++-- 3 files changed, 100 insertions(+), 54 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f839245b1..b0c108bbc 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,6 +15,8 @@ endfunction () # Returns all combinations of the input list, as a list with colon-separated combinations # E.g. input of A B C returns A B C A:B A:C B:C # N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). +# @returns LIST_OUT a list of combinations +# CODES_OUT a list of codes corresponding to each combination, with N meaning the item is not present, and the first letter of the list item meaning it is presen function(AllCombinations list_in) list(LENGTH list_in list_count) set(num_combos 1) @@ -22,7 +24,10 @@ function(AllCombinations list_in) math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") set(LIST_OUT "") foreach (c RANGE 0 ${num_combos}) + set(current_combo "") + set(current_code "") + # this is a little ridiculous just to iterate through a list w/ indices math(EXPR last_list_index "${list_count} - 1") foreach (list_index RANGE 0 ${last_list_index}) @@ -35,57 +40,24 @@ function(AllCombinations list_in) else () set(current_combo ${list_elem}) endif () + string(SUBSTRING ${list_elem} 0 1 code_char) + else () + set(code_char "N") endif () + set(current_code "${current_code}${code_char}") endforeach () - list(APPEND LIST_OUT ${current_combo}) - endforeach () - list(APPEND LIST_OUT " ") # Empty set is a valic combination, but CMake isn't appending the empty string for some reason, use a space - set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) -endfunction () - -# generates object files for each of the sources for each of the combinations of the 
preprocessor definitions passed in -# @param sources_in the source files to build from -# @param defines_in the preprocessor definitions that will be combined to create the object files -# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects -function(GenerateObjects sources_in defines_in all_defines_in) - AllCombinations("${defines_in}") - set(define_combos ${LIST_OUT}) - set(OBJ_LIST_OUT "") - foreach (source_file ${sources_in}) - foreach (def_combo ${define_combos}) - - # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with - string(REPLACE ":" ";" def_combo ${def_combo}) - # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) - set(obj_name "") - foreach (combo_elem ${def_combo}) - string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) - set(obj_name "${obj_name}${letter}") - endforeach () - - # parse file name - string(REGEX MATCH "^[a-zA-Z_0-9]+" source_name ${source_file}) - string(TOUPPER ${source_name} source_name) - - # prepend the uppercased file name to the obj name - set(obj_name "${source_name}_${obj_name}_OBJS") + if (current_combo STREQUAL "") + list(APPEND LIST_OUT " ") # Empty set is a valid combination, but CMake isn't appending the empty string for some reason, use a space + else () + list(APPEND LIST_OUT ${current_combo}) + endif () + list(APPEND CODES_OUT ${current_code}) - # now add the object and set the defines - add_library(${obj_name} OBJECT ${source_file}) - set(cur_defines ${def_combo}) - if ("${cur_defines}" STREQUAL " ") - set(cur_defines ${all_defines_in}) - else () - list(APPEND cur_defines ${all_defines_in}) - endif () - if (cur_defines AND NOT "${cur_defines}" STREQUAL " ") # using space as the empty set - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${cur_defines}") - endif () - list(APPEND OBJ_LIST_OUT ${obj_name}) - endforeach () endforeach () - 
set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) + + set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) + set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () # generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition @@ -94,7 +66,10 @@ endfunction () # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" -function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas) +# @param replace_k_with replaces the "k" in the filename with this string (e.g. symm_k should be symm_TU) +# @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) +function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas replace_k_with append_with) + set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) @@ -113,6 +88,12 @@ function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cb endif () endif () + if (replace_k_with) + string(REGEX REPLACE "k$" ${replace_k_with} obj_name ${obj_name}) + else () + set(obj_name "${obj_name}${append_with}") + endif () + # now add the object and set the defines set(obj_defines ${defines_in}) @@ -133,3 +114,53 @@ function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cb endforeach () set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) endfunction () + +# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in +# @param sources_in the source files to build from +# @param defines_in the preprocessor definitions that will be combined to create the object files +# @param 
float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) +# @param replace_k Replace the "k" in the filename with the define combo letters (else it appends). E.g. symm_k with TRANS and UNIT defined will be symm_TU. +# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects +function(GenerateCombinationObjects sources_in defines_in float_type_in all_defines_in replace_k) + + AllCombinations("${defines_in}") + set(define_combos ${LIST_OUT}) + set(define_codes ${CODES_OUT}) + + set(COMBO_OBJ_LIST_OUT "") + list(LENGTH define_combos num_combos) + math(EXPR num_combos "${num_combos} - 1") + + foreach (c RANGE 0 ${num_combos}) + + list(GET define_combos ${c} define_combo) + list(GET define_codes ${c} define_code) + + foreach (source_file ${sources_in}) + + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" define_combo ${define_combo}) + + # now add the object and set the defines + set(cur_defines ${define_combo}) + if ("${cur_defines}" STREQUAL " ") + set(cur_defines ${all_defines_in}) + else () + list(APPEND cur_defines ${all_defines_in}) + endif () + + set(replace_k_name "") + set(append_name "") + if (replace_k) + set(replace_k_name ${define_code}) + else () + set(append_name ${define_code}) + endif () + + GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "" 0 "${replace_k_name}" "${append_name}") + list(APPEND COMBO_OBJ_LIST_OUT ${obj_name}) + endforeach () + endforeach () + + set(COMBO_OBJ_LIST_OUT ${COMBO_OBJ_LIST_OUT} PARENT_SCOPE) +endfunction () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 1fbf7c729..bf0a5857e 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,6 +1,8 @@ include_directories(${CMAKE_SOURCE_DIR}) +# TODO: These all need NAME/CNAME set (see GenerateNamedObjects) + # sources that need to be compiled twice, once with no 
flags and once with LOWER set(UL_SOURCES sbmv_k.c diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 57865d18b..4427c8ebf 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -25,19 +25,32 @@ endif () # loop through gemm.c defines set(GEMM_DEFINES NN NT TN TT) foreach (GEMM_DEFINE ${GEMM_DEFINES}) - add_library(GEMM_${GEMM_DEFINE}_OBJS OBJECT gemm.c) - set_target_properties(GEMM_${GEMM_DEFINE}_OBJS PROPERTIES COMPILE_DEFINITIONS ${GEMM_DEFINE}) - list(APPEND DBLAS_OBJS GEMM_${GEMM_DEFINE}_OBJS) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endforeach () +AllCombinations("TRANS;UPPER;UNIT") +set(define_combos ${LIST_OUT}) +foreach (def_combo ${define_combos}) + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" def_combo ${def_combo}) -GenerateObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "") + # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) + set(obj_name "") + foreach (combo_elem ${def_combo}) + string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) + set(obj_name "${obj_name}${letter}") + endforeach () +endforeach () + +GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE" "" 0) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("symm_k.c" "LOWER;RSIDE" "NN") +GenerateCombinationObjects("symm_k.c" "LOWER;RSIDE" "DOUBLE" "NN" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "") +GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "") +GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" 
"DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (SMP) From cfaf1c678f85c4b6428030069edea02f903764df Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 09:17:18 -0600 Subject: [PATCH 047/257] Added option to append define codes with an underscore. Fixed the code array not getting reset on subsequent AllCombinations calls. --- cmake/utils.cmake | 12 +++++++++--- driver/level3/CMakeLists.txt | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index b0c108bbc..276375740 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -23,6 +23,7 @@ function(AllCombinations list_in) # subtract 1 since we will iterate from 0 to num_combos math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") set(LIST_OUT "") + set(CODES_OUT "") foreach (c RANGE 0 ${num_combos}) set(current_combo "") @@ -119,8 +120,8 @@ endfunction () # @param sources_in the source files to build from # @param defines_in the preprocessor definitions that will be combined to create the object files # @param float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) -# @param replace_k Replace the "k" in the filename with the define combo letters (else it appends). E.g. symm_k with TRANS and UNIT defined will be symm_TU. # @param all_defines_in (optional) preprocessor definitions that will be applied to all objects +# @param replace_k If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k with TRANS and UNIT defined will be symm_TU. If 0, appends, or if 2 appends with an underscore. 
function(GenerateCombinationObjects sources_in defines_in float_type_in all_defines_in replace_k) AllCombinations("${defines_in}") @@ -151,10 +152,14 @@ function(GenerateCombinationObjects sources_in defines_in float_type_in all_defi set(replace_k_name "") set(append_name "") - if (replace_k) + if (replace_k EQUAL 1) set(replace_k_name ${define_code}) else () - set(append_name ${define_code}) + if (replace_k EQUAL 2) + set(append_name "_${define_code}") + else () + set(append_name ${define_code}) + endif () endif () GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "" 0 "${replace_k_name}" "${append_name}") @@ -164,3 +169,4 @@ function(GenerateCombinationObjects sources_in defines_in float_type_in all_defi set(COMBO_OBJ_LIST_OUT ${COMBO_OBJ_LIST_OUT} PARENT_SCOPE) endfunction () + diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 4427c8ebf..02a6097e3 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -50,7 +50,7 @@ GenerateCombinationObjects("symm_k.c" "LOWER;RSIDE" "DOUBLE" "NN" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE" "" 1) +GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE" "" 2) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (SMP) From 461e6911270e0cfbe917720284984d23c4ae8fce Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 09:23:47 -0600 Subject: [PATCH 048/257] Codes when define is absent are now a parameter to AllCombinations. The level3 object names should now be correct. 
--- cmake/utils.cmake | 9 +++++---- driver/level3/CMakeLists.txt | 22 ++++------------------ 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 276375740..075c0ccc2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,9 +15,10 @@ endfunction () # Returns all combinations of the input list, as a list with colon-separated combinations # E.g. input of A B C returns A B C A:B A:C B:C # N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). +# #param absent_codes codes to use when an element is absent from a combination. For example, if you have TRANS;UNIT;UPPER you may want the code to be NNL when nothing is present. # @returns LIST_OUT a list of combinations # CODES_OUT a list of codes corresponding to each combination, with N meaning the item is not present, and the first letter of the list item meaning it is presen -function(AllCombinations list_in) +function(AllCombinations list_in absent_codes_in) list(LENGTH list_in list_count) set(num_combos 1) # subtract 1 since we will iterate from 0 to num_combos @@ -43,7 +44,7 @@ function(AllCombinations list_in) endif () string(SUBSTRING ${list_elem} 0 1 code_char) else () - set(code_char "N") + list(GET absent_codes_in ${list_index} code_char) endif () set(current_code "${current_code}${code_char}") endforeach () @@ -122,9 +123,9 @@ endfunction () # @param float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) # @param all_defines_in (optional) preprocessor definitions that will be applied to all objects # @param replace_k If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k with TRANS and UNIT defined will be symm_TU. If 0, appends, or if 2 appends with an underscore. 
-function(GenerateCombinationObjects sources_in defines_in float_type_in all_defines_in replace_k) +function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_k) - AllCombinations("${defines_in}") + AllCombinations("${defines_in}" "${absent_codes_in}") set(define_combos ${LIST_OUT}) set(define_codes ${CODES_OUT}) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 02a6097e3..d9d4da709 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -30,27 +30,13 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endforeach () -AllCombinations("TRANS;UPPER;UNIT") -set(define_combos ${LIST_OUT}) -foreach (def_combo ${define_combos}) - # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with - string(REPLACE ":" ";" def_combo ${def_combo}) - - # build a unique variable name for this obj file by picking two letters from the defines (can't use one in this case) - set(obj_name "") - foreach (combo_elem ${def_combo}) - string(REGEX MATCH "^[A-Z][A-Z]" letter ${combo_elem}) - set(obj_name "${obj_name}${letter}") - endforeach () -endforeach () - -GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "DOUBLE" "" 0) +GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "DOUBLE" "" 0) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateCombinationObjects("symm_k.c" "LOWER;RSIDE" "DOUBLE" "NN" 1) +GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "NN" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "DOUBLE" "" 1) +GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "DOUBLE" "" 2) 
+GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "DOUBLE" "" 2) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (SMP) From 1b62a4f3c9989edf0c4cfdefddb8621ac0638c24 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 09:39:40 -0600 Subject: [PATCH 049/257] Changed some function parameters to optional. --- cmake/utils.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 075c0ccc2..26900514d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -70,7 +70,15 @@ endfunction () # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" # @param replace_k_with replaces the "k" in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) -function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas replace_k_with append_with) +function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas) + + if (DEFINED ARGV5) + set(replace_k_with ${ARGV5}) + endif () + + if (DEFINED ARGV6) + set(append_with ${ARGV6}) + endif () set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) From 943fa2fb580a4301652820b75458ddd58718b3df Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 10:49:11 -0600 Subject: [PATCH 050/257] Fixed object names in level2. 
--- cmake/utils.cmake | 41 +++++++++++++++++++++++------------- driver/level2/CMakeLists.txt | 35 ++++++++++-------------------- driver/level3/CMakeLists.txt | 10 ++++----- 3 files changed, 42 insertions(+), 44 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 26900514d..2a1d105e3 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -68,12 +68,12 @@ endfunction () # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" -# @param replace_k_with replaces the "k" in the filename with this string (e.g. symm_k should be symm_TU) +# @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas) if (DEFINED ARGV5) - set(replace_k_with ${ARGV5}) + set(replace_last_with ${ARGV5}) endif () if (DEFINED ARGV6) @@ -98,8 +98,8 @@ function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cb endif () endif () - if (replace_k_with) - string(REGEX REPLACE "k$" ${replace_k_with} obj_name ${obj_name}) + if (replace_last_with) + string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name}) else () set(obj_name "${obj_name}${append_with}") endif () @@ -130,8 +130,11 @@ endfunction () # @param defines_in the preprocessor definitions that will be combined to create the object files # @param float_type_in the float type to define for this build (e.g. 
SINGLE/DOUBLE/etc) # @param all_defines_in (optional) preprocessor definitions that will be applied to all objects -# @param replace_k If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k with TRANS and UNIT defined will be symm_TU. If 0, appends, or if 2 appends with an underscore. -function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_k) +# @param replace_scheme If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k.c with TRANS and UNIT defined will be symm_TU. +# If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. +# If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU. +# If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). +function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_scheme) AllCombinations("${defines_in}" "${absent_codes_in}") set(define_combos ${LIST_OUT}) @@ -159,20 +162,28 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ list(APPEND cur_defines ${all_defines_in}) endif () - set(replace_k_name "") - set(append_name "") - if (replace_k EQUAL 1) - set(replace_k_name ${define_code}) + set(replace_code "") + set(append_code "") + if (replace_scheme EQUAL 1) + set(replace_code ${define_code}) else () - if (replace_k EQUAL 2) - set(append_name "_${define_code}") + if (replace_scheme EQUAL 2) + set(append_code "_${define_code}") + elseif (replace_scheme EQUAL 3) + # first extract the last letter + string(REGEX MATCH "[a-zA-Z]\\." 
last_letter ${source_file}) + string(SUBSTRING ${last_letter} 0 1 last_letter) # remove period from match + # break the code up into the first letter and the remaining (should only be 2 anyway) + string(SUBSTRING ${define_code} 0 1 define_code_first) + string(SUBSTRING ${define_code} 1 -1 define_code_second) + set(replace_code "${define_code_first}${last_letter}${define_code_second}") else () - set(append_name ${define_code}) + set(append_code ${define_code}) # replace_scheme should be 0 endif () endif () - GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "" 0 "${replace_k_name}" "${append_name}") - list(APPEND COMBO_OBJ_LIST_OUT ${obj_name}) + GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "" 0 "${replace_code}" "${append_code}") + list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index bf0a5857e..d596668c4 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,8 +1,6 @@ include_directories(${CMAKE_SOURCE_DIR}) -# TODO: These all need NAME/CNAME set (see GenerateNamedObjects) - # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES sbmv_k.c @@ -29,30 +27,20 @@ set(NU_SOURCES trsv_L.c ) -# first compile all the objects that don't need specific preprocessor defines -add_library(DBLAS_NONE OBJECT - gbmv_k.c # gbmv_N - ${UL_SOURCES} - ${NU_SOURCES} -) - -# then do objects with transpose/triangular/etc definitions - -# objects that need TRANS set -add_library(DBLAS_T OBJECT gbmv_k.c ${NU_SOURCES}) -set_target_properties(DBLAS_T PROPERTIES COMPILE_DEFINITIONS "TRANS") - # objects that need LOWER set -add_library(DBLAS_L OBJECT ${UL_SOURCES}) -set_target_properties(DBLAS_L PROPERTIES COMPILE_DEFINITIONS "LOWER") - -# objects that need UNIT set -add_library(DBLAS_U OBJECT ${NU_SOURCES}) -set_target_properties(DBLAS_U PROPERTIES COMPILE_DEFINITIONS 
"UNIT") +GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "DOUBLE" "" 1) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) # objects that need TRANS and UNIT set -add_library(DBLAS_TU OBJECT ${NU_SOURCES}) -set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS "UNIT;TRANS") +# N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa +GenerateCombinationObjects("${NU_SOURCES}" "TRANS;UNIT" "N;N" "DOUBLE" "" 3) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + +# gbmv uses a lowercase n and t. WHY? WHO KNOWS! +GenerateNamedObjects("gbmv_k.c" "DOUBLE" "" "gbmv_n" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("gbmv_k.c" "DOUBLE" "TRANS" "gbmv_t" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #if (DEFINED SMP) # add_library(DBLASOBJS_SMP @@ -81,6 +69,5 @@ set_target_properties(DBLAS_TU PROPERTIES COMPILE_DEFINITIONS "UNIT;TRANS") # ) #endif () -list(APPEND DBLAS_OBJS "DBLAS_NONE;DBLAS_T;DBLAS_L;DBLAS_U;DBLAS_TU") set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index d9d4da709..61133ce92 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -26,18 +26,18 @@ endif () set(GEMM_DEFINES NN NT TN TT) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) - GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "") + GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endforeach () GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "DOUBLE" "" 0) -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "NN" 1) -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
+list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "" 1) -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "DOUBLE" "" 2) -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) #if (SMP) # From 627d5e7401e1935b6faffa11654ab7e9c07f204d Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 12:22:48 -0600 Subject: [PATCH 051/257] Added SMP objects to driver/level3. --- cmake/utils.cmake | 13 ++++++++++--- driver/level3/CMakeLists.txt | 31 ++++++++++++++++++------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 2a1d105e3..286f271e2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -83,8 +83,10 @@ function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cb set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) - string(SUBSTRING ${float_type_in} 0 1 float_char) - string(TOLOWER ${float_char} float_char) + if (NOT float_type_in STREQUAL "") + string(SUBSTRING ${float_type_in} 0 1 float_char) + string(TOLOWER ${float_char} float_char) + endif () if (NOT name_in) get_filename_component(source_name ${source_file} NAME_WE) @@ -134,8 +136,13 @@ endfunction () # If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. # If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU. # If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). 
+# @param alternate_name replaces the source name as the object name (define codes are still appended) function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_scheme) + if (DEFINED ARGV6) + set(alternate_name ${ARGV6}) + endif () + AllCombinations("${defines_in}" "${absent_codes_in}") set(define_combos ${LIST_OUT}) set(define_codes ${CODES_OUT}) @@ -182,7 +189,7 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ endif () endif () - GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "" 0 "${replace_code}" "${append_code}") + GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}") list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 61133ce92..687664949 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -28,6 +28,10 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + endif () endforeach () GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "DOUBLE" "" 0) @@ -39,19 +43,20 @@ list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "DOUBLE" "" 2) list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) -#if (SMP) -# -# COMMONOBJS += gemm_thread_m.c gemm_thread_n.c gemm_thread_mn.c gemm_thread_variable.c -# COMMONOBJS += syrk_thread.c -# -# if (USE_SIMPLE_THREADED_LEVEL3) -# DBLASOBJS += dgemm_thread_nn.c 
dgemm_thread_nt.c dgemm_thread_tn.c dgemm_thread_tt.c -# DBLASOBJS += dsymm_thread_LU.c dsymm_thread_LL.c dsymm_thread_RU.c dsymm_thread_RL.c -# DBLASOBJS += dsyrk_thread_UN.c dsyrk_thread_UT.c dsyrk_thread_LN.c dsyrk_thread_LT.c -# -# endif () -#endif () -# +if (SMP) + + # N.B. these do NOT have a float type (e.g. DOUBLE) defined! + GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" "" 0) + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + if (NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "THREADED_LEVEL3" 2 "syrk_thread") + list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "THREADED_LEVEL3;NN" 2 "symm_thread") + list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + endif () +endif () + #HPLOBJS = # dgemm_nn.c dgemm_nt.c dgemm_tn.c dgemm_tt.c # dtrsm_LNUU.c dtrsm_LNUN.c dtrsm_LNLU.c dtrsm_LNLN.c From 189fadfde03fec13fec682d2c7bf003eced900d0 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 5 Feb 2015 21:05:11 -0600 Subject: [PATCH 052/257] Started implementing kernel/Makefile in cmake. --- CMakeLists.txt | 4 +- cmake/c_check.cmake | 6 +-- cmake/prebuild.cmake | 1 - kernel/CMakeLists.txt | 85 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 kernel/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index eb15fa4bc..41cb52b23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,8 @@ set(OpenBLAS_MINOR_VERSION 2) set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") -# is this necessary? lapack-netlib has its own fortran checks in its CMakeLists.txt enable_language(Fortran) +enable_language(ASM) message(WARNING "CMake support is experimental. 
This will not produce the same Makefiles that OpenBLAS ships with.") @@ -19,7 +19,7 @@ include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") set(BLASDIRS interface driver/level2 driver/level3 driver/others) -if (NOT ${DYNAMIC_ARCH}) +if (NOT DYNAMIC_ARCH) list(APPEND BLASDIRS kernel) endif () diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 2e5ce5edc..aaa3da7bc 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -26,8 +26,8 @@ # N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. -# TODO: detect NEED_FU/FU -set(NEED_FU 1) +# TODO: detect FU (front underscore) by compiling ctest1.c +set(FU "_") # Convert CMake vars into the format that OpenBLAS expects string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) @@ -74,5 +74,5 @@ file(WRITE ${TARGET_CONF} "#define ARCH_${ARCH}\t1\n" "#define C_${COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" - "#define FUNDERSCORE\t${NEED_FU}\n") + "#define FUNDERSCORE\t${FU}\n") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6312a515e..32faeeea7 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -54,7 +54,6 @@ include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") # compile getarch -enable_language(ASM) set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt new file mode 100644 index 000000000..a36cb8332 --- /dev/null +++ b/kernel/CMakeLists.txt @@ -0,0 +1,85 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + +# TODO: need to read ${KERNELDIR}/KERNEL into CMake vars + + +# Makefile.L1 + +# these are using hardcoded filenames for now, should get them from the KERNEL vars, e.g. 
DAMAXKERNEL instead of amax.S +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS" "amax_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS;USE_MIN" "amin_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "max_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "min_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS" "i*amax_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*max_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*min_k" 0) +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +#DBLASOBJS += \ +# dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ +# dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ +# daxpby_k$(TSUFFIX).$(SUFFIX) + +# Makefile.L2 +#DBLASOBJS += \ +# dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ +# dger_k$(TSUFFIX).$(SUFFIX) + + +# Makefile.L3 +#DKERNELOBJS += \ +# dgemm_kernel$(TSUFFIX).$(SUFFIX) \ +# $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ +# $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) + +#DBLASOBJS += \ +# dgemm_beta$(TSUFFIX).$(SUFFIX) \ +# dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ +# dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ +# dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ +# dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +#DBLASOBJS += \ +# 
dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ +# dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ +# dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ +# dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ +# dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) + +#DBLASOBJS += \ +# domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ +# domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) + +# Makefile.LA +#DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 13d2d48e67d5d76714690e4fb6ab46e5a678431a Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 6 Feb 2015 13:42:20 -0600 Subject: [PATCH 053/257] Added yet another naming scheme for lapack functions. 
--- cmake/utils.cmake | 15 ++++++++++++++- lapack/CMakeLists.txt | 24 +++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 286f271e2..641e7a7c6 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -136,6 +136,7 @@ endfunction () # If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. # If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU. # If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). +# If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel # @param alternate_name replaces the source name as the object name (define codes are still appended) function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_scheme) @@ -184,7 +185,19 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ string(SUBSTRING ${define_code} 0 1 define_code_first) string(SUBSTRING ${define_code} 1 -1 define_code_second) set(replace_code "${define_code_first}${last_letter}${define_code_second}") - else () + elseif (replace_scheme EQUAL 4) + # insert code before the last underscore and pass that in as the alternate_name + get_filename_component(alternate_name ${source_file} NAME_WE) + set(extra_underscore "") + # check if filename has two underscores, insert another if not (e.g. 
getrs_parallel needs to become getrs_U_parallel not getrsU_parallel) + string(REGEX MATCH "_[a-zA-Z]+_" underscores ${alternate_name}) + string(LENGTH "${underscores}" underscores) + if (underscores EQUAL 0) + set(extra_underscore "_") + endif () + string(REGEX REPLACE "(.+)(_[^_]+)$" "\\1${extra_underscore}${define_code}\\2" alternate_name ${alternate_name}) + message(STATUS ${alternate_name}) + else() set(append_code ${define_code}) # replace_scheme should be 0 endif () endif () diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c6d051c76..664ce6d6e 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -4,12 +4,10 @@ include_directories(${CMAKE_SOURCE_DIR}) # TODO: laswp needs arch specific code # TODO: getrs needs to be compiled with and without TRANS (and up to TRANS=4 in the complex case) # TODO: trti2 needs to be compiled with and without UNIT -# TODO: trtri needs to be compiled with and without UNIT set(LAPACK_SOURCES getf2/getf2_k.c getrf/getrf_single.c - getrs/getrs_single.c potrf/potrf_U_single.c potrf/potrf_L_single.c potf2/potf2_U.c @@ -20,6 +18,15 @@ set(LAPACK_SOURCES lauum/lauum_L_single.c trti2/trti2_U.c trti2/trti2_L.c +) + +# sources that need TRANS set +set(TRANS_SOURCES + getrs/getrs_single.c +) + +# sources that need UNIT set +set(UNIT_SOURCES trtri/trtri_U_single.c trtri/trtri_L_single.c ) @@ -55,11 +62,17 @@ if (SMP) set(PARALLEL_SOURCES ${GETRF_SRC} - getrs/getrs_parallel.c potrf/potrf_U_parallel.c potrf/potrf_L_parallel.c lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c + ) + + list(APPEND TRANS_SOURCES + getrs/getrs_parallel.c + ) + + list(APPEND UNIT_SOURCES trtri/trtri_U_parallel.c trtri/trtri_L_parallel.c ) @@ -79,5 +92,10 @@ if (SMP) list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () +GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "DOUBLE" "" 4) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) +GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "DOUBLE" "" 4) +list(APPEND DBLAS_OBJS 
${COMBO_OBJ_LIST_OUT}) + set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 6b5d26e07b017f75a803ff64ac12d700fc298478 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 6 Feb 2015 16:52:19 -0600 Subject: [PATCH 054/257] Added SMP sources to level2 CMakeLists.txt. --- cmake/utils.cmake | 17 ++++++++- driver/level2/CMakeLists.txt | 71 ++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 641e7a7c6..2faa2e3e0 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -70,7 +70,21 @@ endfunction () # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) -function(GenerateNamedObjects sources_in float_type_in defines_in name_in use_cblas) +function(GenerateNamedObjects sources_in float_type_in) + + if (DEFINED ARGV2) + set(defines_in ${ARGV2}) + endif () + + if (DEFINED ARGV3) + set(name_in ${ARGV3}) + endif () + + if (DEFINED ARGV4) + set(use_cblas ${ARGV4}) + else () + set(use_cblas 0) + endif () if (DEFINED ARGV5) set(replace_last_with ${ARGV5}) @@ -196,7 +210,6 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ set(extra_underscore "_") endif () string(REGEX REPLACE "(.+)(_[^_]+)$" "\\1${extra_underscore}${define_code}\\2" alternate_name ${alternate_name}) - message(STATUS ${alternate_name}) else() set(append_code ${define_code}) # replace_scheme should be 0 endif () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index d596668c4..a01b37289 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -11,7 +11,7 @@ set(UL_SOURCES syr2_k.c ) -# sources that need to be 
compiled several times, for UNIT, TRANS +# sources that need to be compiled several times, for UNIT, TRANSA set(NU_SOURCES tbmv_U.c tbsv_U.c @@ -31,43 +31,52 @@ set(NU_SOURCES GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) -# objects that need TRANS and UNIT set +# objects that need TRANSA and UNIT set # N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa -GenerateCombinationObjects("${NU_SOURCES}" "TRANS;UNIT" "N;N" "DOUBLE" "" 3) +GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "DOUBLE" "" 3) list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) # gbmv uses a lowercase n and t. WHY? WHO KNOWS! -GenerateNamedObjects("gbmv_k.c" "DOUBLE" "" "gbmv_n" 0) +GenerateNamedObjects("gbmv_k.c" "DOUBLE" "" "gbmv_n") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("gbmv_k.c" "DOUBLE" "TRANS" "gbmv_t" 0) +GenerateNamedObjects("gbmv_k.c" "DOUBLE" "TRANS" "gbmv_t") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -#if (DEFINED SMP) -# add_library(DBLASOBJS_SMP -# dgemv_thread_n.c dgemv_thread_t.c -# dger_thread.c -# dsymv_thread_U.c dsymv_thread_L.c -# dsyr_thread_U.c dsyr_thread_L.c -# dsyr2_thread_U.c dsyr2_thread_L.c -# dspr_thread_U.c dspr_thread_L.c -# dspr2_thread_U.c dspr2_thread_L.c -# dtrmv_thread_NUU.c dtrmv_thread_NUN.c -# dtrmv_thread_NLU.c dtrmv_thread_NLN.c -# dtrmv_thread_TUU.c dtrmv_thread_TUN.c -# dtrmv_thread_TLU.c dtrmv_thread_TLN.c -# dspmv_thread_U.c dspmv_thread_L.c -# dtpmv_thread_NUU.c dtpmv_thread_NUN.c -# dtpmv_thread_NLU.c dtpmv_thread_NLN.c -# dtpmv_thread_TUU.c dtpmv_thread_TUN.c -# dtpmv_thread_TLU.c dtpmv_thread_TLN.c -# dgbmv_thread_n.c dgbmv_thread_t.c -# dsbmv_thread_U.c dsbmv_thread_L.c -# dtbmv_thread_NUU.c dtbmv_thread_NUN.c -# dtbmv_thread_NLU.c dtbmv_thread_NLN.c -# dtbmv_thread_TUU.c dtbmv_thread_TUN.c -# dtbmv_thread_TLU.c dtbmv_thread_TLN.c -# ) -#endif () +if 
(SMP) + + # gbmv uses a lowercase n and t + GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "" "gbmv_thread_n") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "TRANS" "gbmv_thread_t") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + GenerateNamedObjects("ger_thread.c" "DOUBLE") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + set(UL_SMP_SOURCES + symv_thread.c + syr_thread.c + syr2_thread.c + spr_thread.c + spr2_thread.c + spmv_thread.c + sbmv_thread.c + ) + + GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "DOUBLE" "" 2) + list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + + + set(NU_SMP_SOURCES + trmv_thread.c + tpmv_thread.c + tbmv_thread.c + ) + + GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "DOUBLE" "" 2) + list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + +endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 38681fb1c678cdc4b7d8dfac309f4468d3a4c051 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sat, 7 Feb 2015 12:54:30 -0600 Subject: [PATCH 055/257] Added more kernel files. --- kernel/CMakeLists.txt | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index a36cb8332..6f226b78b 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -7,28 +7,31 @@ include_directories(${CMAKE_SOURCE_DIR}) # Makefile.L1 # these are using hardcoded filenames for now, should get them from the KERNEL vars, e.g. 
DAMAXKERNEL instead of amax.S -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS" "amax_k" 0) +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS" "amax_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS;USE_MIN" "amin_k" 0) +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS;USE_MIN" "amin_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "max_k" 0) +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "max_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "min_k" 0) +GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "min_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS" "i*amax_k" 0) +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS" "i*amax_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k" 0) +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*max_k" 0) +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*max_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*min_k" 0) +GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*min_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/scal.S" "DOUBLE" "" "scal_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #DBLASOBJS += \ @@ -37,6 +40,16 @@ list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # daxpby_k$(TSUFFIX).$(SUFFIX) # Makefile.L2 + +GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "DOUBLE" "TRANS") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + 
+GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "DOUBLE" "" 1) +list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) + #DBLASOBJS += \ # dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ # dger_k$(TSUFFIX).$(SUFFIX) From 2f59135eb6aee046324e91d5a71d338051f8fd9a Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sat, 7 Feb 2015 21:15:21 -0600 Subject: [PATCH 056/257] Added gemv to level2 CMakeLists.txt. --- driver/level2/CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index a01b37289..cb8b1c949 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -44,10 +44,15 @@ list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (SMP) - # gbmv uses a lowercase n and t + # gbmv uses a lowercase n and t. N.B. this uses TRANSA where gbmv.c uses TRANS. Intentional? GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "" "gbmv_thread_n") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "TRANS" "gbmv_thread_t") + GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "TRANSA" "gbmv_thread_t") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + + GenerateNamedObjects("gemv_thread.c" "DOUBLE" "" "gemv_thread_n") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("gemv_thread.c" "DOUBLE" "TRANSA" "gemv_thread_t") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("ger_thread.c" "DOUBLE") From fa0e6a6c937a61df610ce47d863b244d96b4c068 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sat, 7 Feb 2015 21:37:46 -0600 Subject: [PATCH 057/257] Added the rest of the L1 kernel makefile. 
--- kernel/CMakeLists.txt | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6f226b78b..5615bf2c2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -31,13 +31,32 @@ list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*min_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/asum.S" "DOUBLE" "" "asum_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/axpy.S" "DOUBLE" "" "axpy_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/copy.S" "DOUBLE" "C_INTERFACE" "copy_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/dot.S" "DOUBLE" "" "dot_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/nrm2.S" "DOUBLE" "" "nrm2_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("${KERNELDIR}/rot.S" "DOUBLE" "" "rot_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + GenerateNamedObjects("${KERNELDIR}/scal.S" "DOUBLE" "" "scal_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -#DBLASOBJS += \ -# dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ -# dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ -# daxpby_k$(TSUFFIX).$(SUFFIX) +GenerateNamedObjects("${KERNELDIR}/swap.S" "DOUBLE" "" "swap_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("arm/axpby.c" "DOUBLE" "" "axpby_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L2 @@ -50,10 +69,8 @@ list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "DOUBLE" "" 1) list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) -#DBLASOBJS += \ -# dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) 
dsymv_L$(TSUFFIX).$(SUFFIX) \ -# dger_k$(TSUFFIX).$(SUFFIX) - +GenerateNamedObjects("generic/ger.c" "DOUBLE" "" "ger_k") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L3 #DKERNELOBJS += \ From 7fa5c4e2fd383e75f4e0c74dba0de2efc2b66378 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sun, 8 Feb 2015 15:29:18 -0600 Subject: [PATCH 058/257] Fixed some case issues with ARCH. Added some kernel and driver/others objects. --- cmake/c_check.cmake | 14 ++++++++++---- driver/others/CMakeLists.txt | 22 +++++++++++++--------- kernel/CMakeLists.txt | 8 ++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index aaa3da7bc..e32c18a43 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -56,12 +56,16 @@ endif () # TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) if (${ARCH} STREQUAL "AMD64") - set(ARCH "X86_64") + set(ARCH "x86_64") endif () # If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong -if (${ARCH} STREQUAL "X86_64" AND BINARY EQUAL 32) - set(ARCH X86) +if (${ARCH} STREQUAL "x86_64" AND BINARY EQUAL 32) + set(ARCH x86) +endif () + +if (${ARCH} STREQUAL "X86") + set(ARCH x86) endif () set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) @@ -69,9 +73,11 @@ if (${COMPILER_ID} STREQUAL "GNU") set(COMPILER_ID "GCC") endif () +string(TOUPPER ${ARCH} UC_ARCH) + file(WRITE ${TARGET_CONF} "#define OS_${HOST_OS}\t1\n" - "#define ARCH_${ARCH}\t1\n" + "#define ARCH_${UC_ARCH}\t1\n" "#define C_${COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 0d2a2f486..e14a916b2 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -29,9 +29,7 @@ if (SMP) endif () endif () -add_library(COMMON_OBJS OBJECT - ${MEMORY} - ${SMP_SOURCES} +set(COMMON_SOURCES xerbla.c abs.c # 
TODO: this is split into c_abs (DOUBLE unset) and z_abs (DOUBLE set) in the Makefile openblas_set_num_threads.c @@ -40,12 +38,12 @@ add_library(COMMON_OBJS OBJECT openblas_error_handle.c ) -#ifeq ($(DYNAMIC_ARCH), 1) -#COMMONOBJS += dynamic.$(SUFFIX) -#else -#COMMONOBJS += parameter.$(SUFFIX) -#endif -# +if (DYNAMIC_ARCH) + list(APPEND COMMON_SOURCES dynamic.c) +else () + list(APPEND COMMON_SOURCES parameter.c) +endif () + #ifdef EXPRECISION #COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) #endif @@ -62,6 +60,12 @@ add_library(COMMON_OBJS OBJECT #COMMONOBJS += profile.$(SUFFIX) #endif +add_library(COMMON_OBJS OBJECT + ${MEMORY} + ${SMP_SOURCES} + ${COMMON_SOURCES} +) + list(APPEND DBLAS_OBJS "COMMON_OBJS") #LIBOTHERS = libothers.$(LIBSUFFIX) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 5615bf2c2..a6eb14e73 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -3,6 +3,14 @@ include_directories(${CMAKE_SOURCE_DIR}) # TODO: need to read ${KERNELDIR}/KERNEL into CMake vars +# Makeflie + +message(STATUS "${ARCH}") +if (${ARCH} STREQUAL "x86") + GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + # Makefile.L1 From 4c65afcce175bd93041b85f1bd1cfa128ade12e2 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 09:52:14 -0600 Subject: [PATCH 059/257] Changed kernel filenames to vars. These will need to be read from KERNEL. Added some kernel/L3 objects. 
--- kernel/CMakeLists.txt | 107 +++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 32 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index a6eb14e73..31839b054 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,11 +1,8 @@ include_directories(${CMAKE_SOURCE_DIR}) -# TODO: need to read ${KERNELDIR}/KERNEL into CMake vars - # Makeflie -message(STATUS "${ARCH}") if (${ARCH} STREQUAL "x86") GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) @@ -14,56 +11,74 @@ endif () # Makefile.L1 -# these are using hardcoded filenames for now, should get them from the KERNEL vars, e.g. DAMAXKERNEL instead of amax.S -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS" "amax_k") +# TODO: need to read ${KERNELDIR}/KERNEL into CMake vars +set(DAMAXKERNEL amax.S) +set(DAMINKERNEL amax.S) +set(DMAXKERNEL amax.S) +set(DMINKERNEL amax.S) +set(IDAMAXKERNEL iamax.S) +set(IDAMINKERNEL iamax.S) +set(IDMAXKERNEL iamax.S) +set(IDMINKERNEL iamax.S) +set(DASUMKERNEL asum.S) +set(DAXPYKERNEL axpy.S) +set(DCOPYKERNEL copy.S) +set(DDOTKERNEL dot.S) +set(DNRM2KERNEL nrm2.S) +set(DROTKERNEL rot.S) +set(DSCALKERNEL scal.S) +set(DSWAPKERNEL swap.S) +set(DAXPBYKERNEL ../arm/axpby.c) + +GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "DOUBLE" "USE_ABS" "amax_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "USE_ABS;USE_MIN" "amin_k") +GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "DOUBLE" "USE_ABS;USE_MIN" "amin_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "max_k") +GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "DOUBLE" "" "max_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/amax.S" "DOUBLE" "" "min_k") +GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "DOUBLE" "" "min_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
-GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS" "i*amax_k") +GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "DOUBLE" "USE_ABS" "i*amax_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k") +GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*max_k") +GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "DOUBLE" "" "i*max_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/iamax.S" "DOUBLE" "" "i*min_k") +GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "DOUBLE" "" "i*min_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/asum.S" "DOUBLE" "" "asum_k") +GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "DOUBLE" "" "asum_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/axpy.S" "DOUBLE" "" "axpy_k") +GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "DOUBLE" "" "axpy_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/copy.S" "DOUBLE" "C_INTERFACE" "copy_k") +GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "DOUBLE" "C_INTERFACE" "copy_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/dot.S" "DOUBLE" "" "dot_k") +GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "DOUBLE" "" "dot_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/nrm2.S" "DOUBLE" "" "nrm2_k") +GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "DOUBLE" "" "nrm2_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/rot.S" "DOUBLE" "" "rot_k") +GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "DOUBLE" "" "rot_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/scal.S" "DOUBLE" "" "scal_k") +GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" 
"DOUBLE" "" "scal_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("${KERNELDIR}/swap.S" "DOUBLE" "" "swap_k") +GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "DOUBLE" "" "swap_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -GenerateNamedObjects("arm/axpby.c" "DOUBLE" "" "axpby_k") +GenerateNamedObjects("${KERNELDIR}/${DAXPBYKERNEL}" "DOUBLE" "" "axpby_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L2 @@ -81,17 +96,45 @@ GenerateNamedObjects("generic/ger.c" "DOUBLE" "" "ger_k") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L3 -#DKERNELOBJS += \ -# dgemm_kernel$(TSUFFIX).$(SUFFIX) \ -# $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ -# $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) - -#DBLASOBJS += \ -# dgemm_beta$(TSUFFIX).$(SUFFIX) \ -# dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ -# dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ -# dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ -# dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +set(DGEMM_BETA ../generic/gemm_beta.c) +set(DGEMMKERNEL gemm_kernel_2x4_penryn.S) + +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +if (DGEMMINCOPY) + GenerateNamedObjects(${DGEMMINCOPY} "DOUBLE") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + +if (DGEMMITCOPY) + GenerateNamedObjects(${DGEMMITCOPY} "DOUBLE") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + +if (DGEMMONCOPY) + GenerateNamedObjects(${DGEMMONCOPY} "DOUBLE") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + +if (DGEMMOTCOPY) + GenerateNamedObjects(${DGEMMOTCOPY} "DOUBLE") + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +endif () + +GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "DOUBLE") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateCombinationObjects("${KERNELDIR}/${DGEMMKERNEL}" "LEFT;TRANSA" "R;N" "DOUBLE" "TRMMKERNEL" 2 "trmm_kernel") +list(APPEND DBLAS_OBJS 
${COMBO_OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "LT;TRSMKERNEL" "trsm_kernel_LT") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "RT;TRSMKERNEL" "trsm_kernel_RT") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) #DBLASOBJS += \ # dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ From f992799226ca30c363be60864e6b44df696f51eb Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 10:47:35 -0600 Subject: [PATCH 060/257] Added the rest of Makefile.L3. --- kernel/CMakeLists.txt | 128 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 105 insertions(+), 23 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 31839b054..aa38cdc85 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -136,29 +136,111 @@ list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "RT;TRSMKERNEL" "trsm_kernel_RT") list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) -#DBLASOBJS += \ -# dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ -# dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) 
dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ -# dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ -# dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ -# dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) - -#DBLASOBJS += \ -# domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ -# domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) +# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. +# Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
+GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iunucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iunncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_ounucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trmm_ounncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_ilnucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_ilnncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_olnucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_olnncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iutucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iutncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_outucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trmm_outncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_iltucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_iltncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
+GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_oltucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_oltncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iunucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iunncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_ounucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_ounncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_ilnucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_ilnncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_olnucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_olnncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iutucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iutncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_outucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_outncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + 
+GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_iltucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_iltncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_oltucopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_oltncopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "OUTER" "symm_outcopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "" "symm_iutcopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER;OUTER" "symm_oltcopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER" "symm_iltcopy") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + +if (NOT DEFINED DOMATCOPY_CN) + set(DOMATCOPY_CN ../arm/omatcopy_cn.c) +endif () +if (NOT DEFINED DOMATCOPY_RN) + set(DOMATCOPY_RN ../arm/omatcopy_rn.c) +endif () +if (NOT DEFINED DOMATCOPY_CT) + set(DOMATCOPY_CT ../arm/omatcopy_ct.c) +endif () +if (NOT DEFINED DOMATCOPY_RT) + set(DOMATCOPY_RT ../arm/omatcopy_rt.c) +endif () + +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CN}" "DOUBLE" "" "domatcopy_k_cn") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RN}" "DOUBLE" "ROWM" "domatcopy_k_rn") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CT}" "DOUBLE" "" "domatcopy_k_ct") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RT}" "DOUBLE" "ROWM" "domatcopy_k_rt") +list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.LA #DBLASOBJS += 
dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) From e8c39138c64815843db1e0e11b349da060d48b52 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 12:28:09 -0600 Subject: [PATCH 061/257] Removed return value from GenerateNamedObjects. It sets DBLAS_OBJS directly to save a bunch of list appending in the CMakeLists.txt files. --- cmake/utils.cmake | 6 +++-- driver/level2/CMakeLists.txt | 12 --------- driver/level3/CMakeLists.txt | 9 ------- interface/CMakeLists.txt | 13 --------- kernel/CMakeLists.txt | 51 ------------------------------------ lapack/CMakeLists.txt | 4 --- 6 files changed, 4 insertions(+), 91 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 2faa2e3e0..9b5d7de14 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -138,7 +138,9 @@ function(GenerateNamedObjects sources_in float_type_in) list(APPEND OBJ_LIST_OUT ${obj_name}) endforeach () - set(OBJ_LIST_OUT ${OBJ_LIST_OUT} PARENT_SCOPE) + + list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) + set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) endfunction () # generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in @@ -220,6 +222,6 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ endforeach () endforeach () - set(COMBO_OBJ_LIST_OUT ${COMBO_OBJ_LIST_OUT} PARENT_SCOPE) + set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) endfunction () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index cb8b1c949..d8f8123d3 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -29,34 +29,25 @@ set(NU_SOURCES # objects that need LOWER set GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "DOUBLE" "" 1) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) # objects that need TRANSA and UNIT set # N.B. 
BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "DOUBLE" "" 3) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) # gbmv uses a lowercase n and t. WHY? WHO KNOWS! GenerateNamedObjects("gbmv_k.c" "DOUBLE" "" "gbmv_n") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("gbmv_k.c" "DOUBLE" "TRANS" "gbmv_t") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (SMP) # gbmv uses a lowercase n and t. N.B. this uses TRANSA where gbmv.c uses TRANS. Intentional? GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "" "gbmv_thread_n") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "TRANSA" "gbmv_thread_t") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("gemv_thread.c" "DOUBLE" "" "gemv_thread_n") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("gemv_thread.c" "DOUBLE" "TRANSA" "gemv_thread_t") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("ger_thread.c" "DOUBLE") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) set(UL_SMP_SOURCES symv_thread.c @@ -69,8 +60,6 @@ if (SMP) ) GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "DOUBLE" "" 2) - list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) - set(NU_SMP_SOURCES trmv_thread.c @@ -79,7 +68,6 @@ if (SMP) ) GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "DOUBLE" "" 2) - list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) endif () diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 687664949..b9a817323 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -27,33 +27,24 @@ set(GEMM_DEFINES NN NT TN TT) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (SMP AND NOT 
USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () endforeach () GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "DOUBLE" "" 0) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "NN" 1) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "" 1) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "DOUBLE" "" 2) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) if (SMP) # N.B. these do NOT have a float type (e.g. DOUBLE) defined! GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" "" 0) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (NOT USE_SIMPLE_THREADED_LEVEL3) GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "THREADED_LEVEL3" 2 "syrk_thread") - list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "THREADED_LEVEL3;NN" 2 "symm_thread") - list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) endif () endif () diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 90fca8b8f..1b0ac42d6 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -41,34 +41,22 @@ endif () foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "DOUBLE" "TRMM" "trmm" ${CBLAS_FLAG}) - 
list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # max and imax are compiled 4 times GenerateNamedObjects("max.c" "DOUBLE" "" "" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS" "amax" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("max.c" "DOUBLE" "USE_MIN" "min" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("imax.c" "DOUBLE" "" "i*max" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS" "i*amax" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("imax.c" "DOUBLE" "USE_MIN" "i*min" ${CBLAS_FLAG}) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endforeach () @@ -79,7 +67,6 @@ if (NOT DEFINED NO_LAPACK) lapack/lauum.c lapack/trti2.c lapack/trtri.c ) GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index aa38cdc85..329bf5375 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -101,124 +101,77 @@ set(DGEMM_BETA ../generic/gemm_beta.c) set(DGEMMKERNEL gemm_kernel_2x4_penryn.S) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (DGEMMINCOPY) GenerateNamedObjects(${DGEMMINCOPY} "DOUBLE") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () if (DGEMMITCOPY) GenerateNamedObjects(${DGEMMITCOPY} "DOUBLE") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () if (DGEMMONCOPY) GenerateNamedObjects(${DGEMMONCOPY} "DOUBLE") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () if (DGEMMOTCOPY) 
GenerateNamedObjects(${DGEMMOTCOPY} "DOUBLE") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "DOUBLE") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateCombinationObjects("${KERNELDIR}/${DGEMMKERNEL}" "LEFT;TRANSA" "R;N" "DOUBLE" "TRMMKERNEL" 2 "trmm_kernel") -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "LT;TRSMKERNEL" "trsm_kernel_LT") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "RT;TRSMKERNEL" "trsm_kernel_RT") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iunucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iunncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_ounucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trmm_ounncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_ilnucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_ilnncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_olnucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_olnncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iutucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iutncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_outucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trmm_outncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_iltucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_iltncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_oltucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_oltncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iunucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iunncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_ounucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_ounncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_ilnucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_ilnncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_olnucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_olnncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iutucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iutncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_outucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_outncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) 
GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_iltucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_iltncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_oltucopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_oltncopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "OUTER" "symm_outcopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "" "symm_iutcopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER;OUTER" "symm_oltcopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER" "symm_iltcopy") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (NOT DEFINED DOMATCOPY_CN) set(DOMATCOPY_CN ../arm/omatcopy_cn.c) @@ -234,13 +187,9 @@ if (NOT DEFINED DOMATCOPY_RT) endif () GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CN}" "DOUBLE" "" "domatcopy_k_cn") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RN}" "DOUBLE" "ROWM" "domatcopy_k_rn") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CT}" "DOUBLE" "" "domatcopy_k_ct") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RT}" "DOUBLE" "ROWM" "domatcopy_k_rt") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.LA #DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 664ce6d6e..5070e0c05 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -50,7 
+50,6 @@ set(ZLAPACK_SOURCES ) GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) if (SMP) @@ -89,13 +88,10 @@ if (SMP) ) GenerateNamedObjects("${PARALLEL_SOURCES}" "DOUBLE" "" "" 0) - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "DOUBLE" "" 4) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "DOUBLE" "" 4) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 4bfaf1ce6612542ce71f668a1901005da18792c1 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 12:56:55 -0600 Subject: [PATCH 062/257] Removed some list appends I missed. --- kernel/CMakeLists.txt | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 329bf5375..522fac349 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -5,7 +5,6 @@ include_directories(${CMAKE_SOURCE_DIR}) if (${ARCH} STREQUAL "x86") GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) endif () @@ -31,69 +30,29 @@ set(DSWAPKERNEL swap.S) set(DAXPBYKERNEL ../arm/axpby.c) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "DOUBLE" "USE_ABS" "amax_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "DOUBLE" "USE_ABS;USE_MIN" "amin_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "DOUBLE" "" "max_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "DOUBLE" "" "min_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "DOUBLE" "USE_ABS" "i*amax_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "DOUBLE" 
"USE_ABS;USE_MIN" "i*amin_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "DOUBLE" "" "i*max_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "DOUBLE" "" "i*min_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "DOUBLE" "" "asum_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "DOUBLE" "" "axpy_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "DOUBLE" "C_INTERFACE" "copy_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "DOUBLE" "" "dot_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "DOUBLE" "" "nrm2_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "DOUBLE" "" "rot_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "DOUBLE" "" "scal_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "DOUBLE" "" "swap_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/${DAXPBYKERNEL}" "DOUBLE" "" "axpby_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L2 GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "DOUBLE" "TRANS") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "DOUBLE" "" 1) -list(APPEND DBLAS_OBJS ${COMBO_OBJ_LIST_OUT}) - GenerateNamedObjects("generic/ger.c" "DOUBLE" "" "ger_k") -list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) # Makefile.L3 From c0624a26befe30f506a89b39dd5fc2ace726d3a0 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 14:34:29 -0600 Subject: [PATCH 063/257] Fixed some dgemm_copy function 
names. --- kernel/CMakeLists.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 522fac349..8bc325f17 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -56,25 +56,34 @@ GenerateNamedObjects("generic/ger.c" "DOUBLE" "" "ger_k") # Makefile.L3 +# TODO: these are from KERNEL.PENRYN - they should be read in from the appropriate ${KERNELDIR}/KERNEL file set(DGEMM_BETA ../generic/gemm_beta.c) set(DGEMMKERNEL gemm_kernel_2x4_penryn.S) +set(DGEMMINCOPY gemm_ncopy_2.S) +set(DGEMMITCOPY gemm_tcopy_2.S) +set(DGEMMONCOPY ../generic/gemm_ncopy_4.c) +set(DGEMMOTCOPY ../generic/gemm_tcopy_4.c) +set(DGEMMINCOPYOBJ gemm_incopy) +set(DGEMMITCOPYOBJ gemm_itcopy) +set(DGEMMONCOPYOBJ gemm_oncopy) +set(DGEMMOTCOPYOBJ gemm_otcopy) -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "" "gemm_kernel") if (DGEMMINCOPY) - GenerateNamedObjects(${DGEMMINCOPY} "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "" "${DGEMMINCOPYOBJ}") endif () if (DGEMMITCOPY) - GenerateNamedObjects(${DGEMMITCOPY} "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "" "${DGEMMITCOPYOBJ}") endif () if (DGEMMONCOPY) - GenerateNamedObjects(${DGEMMONCOPY} "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "" "${DGEMMONCOPYOBJ}") endif () if (DGEMMOTCOPY) - GenerateNamedObjects(${DGEMMOTCOPY} "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "" "${DGEMMOTCOPYOBJ}") endif () GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "DOUBLE") From 6ddbfea7004ab3c16e35491c2f8ac066e09ffa7a Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 15:15:58 -0600 Subject: [PATCH 064/257] Added generic laswp object. 
--- lapack/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 5070e0c05..7f37d49b3 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,7 +1,6 @@ include_directories(${CMAKE_SOURCE_DIR}) -# TODO: laswp needs arch specific code # TODO: getrs needs to be compiled with and without TRANS (and up to TRANS=4 in the complex case) # TODO: trti2 needs to be compiled with and without UNIT @@ -49,7 +48,11 @@ set(ZLAPACK_SOURCES trtri/trtri_L_single.c ) -GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) +GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE") + +# TODO: laswp needs arch specific code +GenerateNamedObjects("laswp/generic/laswp_k.c" "DOUBLE" "" "laswp_plus") +GenerateNamedObjects("laswp/generic/laswp_k.c" "DOUBLE" "MINUS" "laswp_minus") if (SMP) From 3b20b62423150ea0eb2a4e2c9d296a527c30e827 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 9 Feb 2015 15:29:28 -0600 Subject: [PATCH 065/257] Fixed trti2 name. 
--- lapack/CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 7f37d49b3..346f96e34 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,8 +1,6 @@ include_directories(${CMAKE_SOURCE_DIR}) -# TODO: getrs needs to be compiled with and without TRANS (and up to TRANS=4 in the complex case) -# TODO: trti2 needs to be compiled with and without UNIT set(LAPACK_SOURCES getf2/getf2_k.c @@ -15,8 +13,6 @@ set(LAPACK_SOURCES lauu2/lauu2_L.c lauum/lauum_U_single.c lauum/lauum_L_single.c - trti2/trti2_U.c - trti2/trti2_L.c ) # sources that need TRANS set @@ -30,6 +26,12 @@ set(UNIT_SOURCES trtri/trtri_L_single.c ) +set(UNIT_SOURCES2 + trti2/trti2_U.c + trti2/trti2_L.c +) + +# TODO: getrs needs to be compiled with up to TRANS=4 in the complex case set(ZLAPACK_SOURCES getf2/zgetf2_k.c getrf/getrf_single.c @@ -95,6 +97,7 @@ endif () GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "DOUBLE" "" 4) GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "DOUBLE" "" 4) +GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "DOUBLE" "" 0) set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 96cf6779ca9f28490affebf3a74ccf90cc8e8246 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 10 Feb 2015 11:01:01 -0600 Subject: [PATCH 066/257] Added DLA sources from lapack-netlib. Can't use the lapack-netlib cmake files, since they are designed to build a complete lapack/blas library. They have their own fortran detection and flag setup and so on. Instead I'll just recreate the makefiles I need. Fixed a typo in the NAME defines. 
--- CMakeLists.txt | 13 +++++++++++++ cmake/utils.cmake | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41cb52b23..b8e1d6ad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,19 @@ foreach (DBLAS_OBJ ${DBLAS_OBJS}) set_target_properties(${DBLAS_OBJ} PROPERTIES COMPILE_DEFINITIONS "${PREV_DEFS};DOUBLE") list(APPEND TARGET_OBJS "$") endforeach () + +# netlib: + +# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. + +# N.B. if a source is added or removed you MUST re-run the cmake command manually; make will not do it. +file(GLOB DLA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/d*.f") + +add_library(DLA_OBJ OBJECT ${DLA_SOURCES}) +set_target_properties(${DLA_OBJ} PROPERTIES COMPILE_FLAGS ${LAPACK_FFLAGS}) +list(APPEND TARGET_OBJS "$") + +# add objects to the openblas lib add_library(openblas ${TARGET_OBJS}) # TODO: Why is the config saved here? Is this necessary with CMake? diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9b5d7de14..a6952c833 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -128,7 +128,7 @@ function(GenerateNamedObjects sources_in float_type_in) list(APPEND obj_defines "CBLAS") endif () - list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") + list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") list(APPEND obj_defines ${defines_in}) list(APPEND obj_defines ${float_type_in}) From 8743093bd7b6e87c27564c5ca01c77ee0a655aa8 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 10 Feb 2015 11:47:46 -0600 Subject: [PATCH 067/257] Added aux files from lapack-netlib. 
--- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8e1d6ad1..d82ef61ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,16 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. +# aux files +set(ALLAUX + ${NETLIB_LAPACK_DIR}/SRC/ilaenv.f ${NETLIB_LAPACK_DIR}/SRC/ieeeck.f ${NETLIB_LAPACK_DIR}/SRC/lsamen.f ${NETLIB_LAPACK_DIR}/SRC/xerbla_array.f ${NETLIB_LAPACK_DIR}/SRC/iparmq.f + ${NETLIB_LAPACK_DIR}/SRC/ilaprec.f ${NETLIB_LAPACK_DIR}/SRC/ilatrans.f ${NETLIB_LAPACK_DIR}/SRC/ilauplo.f ${NETLIB_LAPACK_DIR}/SRC/iladiag.f ${NETLIB_LAPACK_DIR}/SRC/chla_transtype.f + ${NETLIB_LAPACK_DIR}/INSTALL/ilaver.f ${NETLIB_LAPACK_DIR}/INSTALL/slamch.f +) +add_library(AUX_OBJ OBJECT ${ALLAUX}) +set_target_properties(${AUX_OBJ} PROPERTIES COMPILE_FLAGS ${LAPACK_FFLAGS}) +list(APPEND TARGET_OBJS "$") + # N.B. if a source is added or removed you MUST re-run the cmake command manually; make will not do it. file(GLOB DLA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/d*.f") From 162791e30e5c725360d5c0de4c5c38f7643d4055 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 10 Feb 2015 12:42:05 -0600 Subject: [PATCH 068/257] Added common objects from kernel Makefile. 
--- cmake/utils.cmake | 4 +++- kernel/CMakeLists.txt | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a6952c833..672dcad33 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,7 +130,9 @@ function(GenerateNamedObjects sources_in float_type_in) list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") list(APPEND obj_defines ${defines_in}) - list(APPEND obj_defines ${float_type_in}) + if (NOT ${float_type_in} STREQUAL "SINGLE") + list(APPEND obj_defines ${float_type_in}) + endif () add_library(${obj_name} OBJECT ${source_file}) set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8bc325f17..a83bd0dbe 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -7,6 +7,14 @@ if (${ARCH} STREQUAL "x86") GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") endif () +# TODO: Read from ${KERNELDIR}/KERNEL - some architectures use a different lsame +set(LSAME_KERNEL lsame.S) +set(SCABS_KERNEL cabs.S) +set(DCABS_KERNEL cabs.S) +GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "" "F_INTERFACE" "lsame") +GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "SINGLE" "COMPLEX;F_INTERFACE" "cabs1") +GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE" "COMPLEX;F_INTERFACE" "cabs1") + # Makefile.L1 From 64b5a0ef84c5ec1f0b7a7bf083a1b20dd015ed77 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 10 Feb 2015 14:29:05 -0600 Subject: [PATCH 069/257] Added AUX files from lapack-netlib. 
--- CMakeLists.txt | 24 +++-------- cmake/fc.cmake | 7 ++++ cmake/lapack.cmake | 100 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 cmake/lapack.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index d82ef61ea..9db677d35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open enable_language(Fortran) enable_language(ASM) -message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with.") +message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only DOUBLE and x86 support is currently available.") include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") @@ -70,23 +70,11 @@ endforeach () # netlib: # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. - -# aux files -set(ALLAUX - ${NETLIB_LAPACK_DIR}/SRC/ilaenv.f ${NETLIB_LAPACK_DIR}/SRC/ieeeck.f ${NETLIB_LAPACK_DIR}/SRC/lsamen.f ${NETLIB_LAPACK_DIR}/SRC/xerbla_array.f ${NETLIB_LAPACK_DIR}/SRC/iparmq.f - ${NETLIB_LAPACK_DIR}/SRC/ilaprec.f ${NETLIB_LAPACK_DIR}/SRC/ilatrans.f ${NETLIB_LAPACK_DIR}/SRC/ilauplo.f ${NETLIB_LAPACK_DIR}/SRC/iladiag.f ${NETLIB_LAPACK_DIR}/SRC/chla_transtype.f - ${NETLIB_LAPACK_DIR}/INSTALL/ilaver.f ${NETLIB_LAPACK_DIR}/INSTALL/slamch.f -) -add_library(AUX_OBJ OBJECT ${ALLAUX}) -set_target_properties(${AUX_OBJ} PROPERTIES COMPILE_FLAGS ${LAPACK_FFLAGS}) -list(APPEND TARGET_OBJS "$") - -# N.B. if a source is added or removed you MUST re-run the cmake command manually; make will not do it. 
-file(GLOB DLA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/d*.f") - -add_library(DLA_OBJ OBJECT ${DLA_SOURCES}) -set_target_properties(${DLA_OBJ} PROPERTIES COMPILE_FLAGS ${LAPACK_FFLAGS}) -list(APPEND TARGET_OBJS "$") +# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. +include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") +add_library(LA_OBJ OBJECT ${LA_SOURCES}) +set_target_properties(LA_OBJ PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") +list(APPEND TARGET_OBJS "$") # add objects to the openblas lib add_library(openblas ${TARGET_OBJS}) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index a47865b63..ba156c210 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -191,3 +191,10 @@ if (${F_COMPILER} STREQUAL "COMPAQ") endif () endif () +# from the root Makefile - this is for lapack-netlib to compile the correct secnd file. +if (${F_COMPILER} STREQUAL "GFORTRAN") + set(TIMER "INT_ETIME") +else () + set(TIMER "NONE") +endif () + diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake new file mode 100644 index 000000000..14581a9b2 --- /dev/null +++ b/cmake/lapack.cmake @@ -0,0 +1,100 @@ +# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
+ +set(ALLAUX + ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ../INSTALL/ilaver.f ../INSTALL/slamch.f +) + +set(DZLAUX + dbdsdc.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f + dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f + dlartgp.f dlartgs.f + ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f +) + +set(DLASRC + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f + dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f + dgetc2.f dgetri.f + dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f + dggglm.f dgghrd.f dgglse.f dggqrf.f + dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + 
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f + dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f + dstevx.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytri2.f dsytri2x.f + dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f 
dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f +) + +set(LA_REL_SRC ${ALLAUX} ${DZLAUX} ${DLASRC}) + +# add lapack-netlib folder to the sources +set(LA_SOURCES "") +foreach (LA_FILE ${LA_REL_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") +endforeach () + From d60b49e5c564ec7e9d1a159af86833730a27c9c7 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 10 Feb 2015 14:36:43 -0600 Subject: [PATCH 070/257] Turned off uninitialized variable warning when compiling lapack-netlib. --- cmake/system.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 2a0678f83..3d58fa2e3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -342,6 +342,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endforeach () endif () +if (${F_COMPILER} STREQUAL "GFORTRAN") + # lapack-netlib is rife with uninitialized warnings -hpa + set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized") +endif () + set(LAPACK_CFLAGS "${CMAKE_C_CFLAGS} -DHAVE_LAPACK_CONFIG_H") if (INTERFACE64) set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_ILP64") From c94fe71278ad31606a84699f33832779a38f2520 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 11 Feb 2015 10:54:14 -0600 Subject: [PATCH 071/257] Removed incoming-stack-boundary for MSVC. Made float type optional for GenerateNamedObjects. Called GenerateNamedObjects for a couple of driver/others files that needed NAME/CNAME set. 
--- cmake/os.cmake | 4 +++- cmake/utils.cmake | 8 ++++++-- driver/others/CMakeLists.txt | 8 +++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index cf36ef62f..eb7df31ed 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -57,7 +57,9 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 if (${ARCH} STREQUAL "x86") - set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") + if (NOT MSVC) + set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") + endif () set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") endif () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 672dcad33..715f298b7 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -70,7 +70,11 @@ endfunction () # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. 
trmm_R should be trmm_RTUU or some other combination of characters) -function(GenerateNamedObjects sources_in float_type_in) +function(GenerateNamedObjects sources_in) + + if (DEFINED ARGV1) + set(float_type_in ${ARGV1}) + endif () if (DEFINED ARGV2) set(defines_in ${ARGV2}) @@ -97,7 +101,7 @@ function(GenerateNamedObjects sources_in float_type_in) set(OBJ_LIST_OUT "") foreach (source_file ${sources_in}) - if (NOT float_type_in STREQUAL "") + if (DEFINED float_type_in AND NOT float_type_in STREQUAL "") string(SUBSTRING ${float_type_in} 0 1 float_char) string(TOLOWER ${float_char} float_char) endif () diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index e14a916b2..7f8672eb2 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -31,13 +31,15 @@ endif () set(COMMON_SOURCES xerbla.c - abs.c # TODO: this is split into c_abs (DOUBLE unset) and z_abs (DOUBLE set) in the Makefile openblas_set_num_threads.c - openblas_get_config.c - openblas_get_parallel.c openblas_error_handle.c ) +# these need to have NAME/CNAME set, so use GenerateNamedObjects +GenerateNamedObjects("abs.c" "" "" "c_abs") +GenerateNamedObjects("abs.c" "" "DOUBLE" "z_abs") +GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c") + if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic.c) else () From 5d3fc092e9eae1982ae9947066546131c57f2fbe Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 11 Feb 2015 11:10:45 -0600 Subject: [PATCH 072/257] Added MSVC defines to common.h. Don't have unistd.h in MSVC. Changed YIELDING to use the YieldProcessor macro. 
--- common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common.h b/common.h index fe2083469..0761f5536 100644 --- a/common.h +++ b/common.h @@ -82,7 +82,10 @@ extern "C" { #include #include #include + +#if !defined(_MSC_VER) #include +#endif #ifdef OS_LINUX #include @@ -307,8 +310,12 @@ typedef int blasint; #endif #if defined(OS_WINDOWS) +#ifdef _MSC_VER +#define YIELDING YieldProcessor() +#else #define YIELDING SwitchToThread() #endif +#endif #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); From a0d9a7fd833f5c5e14b2b30d9518f6a70c40c3f3 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 11 Feb 2015 11:11:47 -0600 Subject: [PATCH 073/257] Changed _Complex types in common_level1.h to use the typedef. --- common_level1.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common_level1.h b/common_level1.h index 2a1b4f1cf..32ffd6f18 100644 --- a/common_level1.h +++ b/common_level1.h @@ -47,12 +47,12 @@ double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -float _Complex cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); -float _Complex cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); -double _Complex zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); -double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); -xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +openblas_complex_double zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +openblas_complex_double zdotu_k 
(BLASLONG, double *, BLASLONG, double *, BLASLONG); +openblas_complex_xdouble xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +openblas_complex_xdouble xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); From 056ba2675556722bb5180f5b4cab93559267b516 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 11 Feb 2015 11:13:17 -0600 Subject: [PATCH 074/257] Changed a number of inline calls to use __inline. MSVC doesn't implement C99, so can't use the inline keyword. __inline appears to work in MSVC and GCC. --- common_x86.h | 2 +- driver/level3/syr2k_k.c | 2 +- driver/level3/syrk_k.c | 2 +- lapack/getrf/getrf_parallel.c | 2 +- symcopy.h | 32 ++++++++++++++++---------------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/common_x86.h b/common_x86.h index 9d82090cc..f096e9074 100644 --- a/common_x86.h +++ b/common_x86.h @@ -100,7 +100,7 @@ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #define WHEREAMI -static inline int WhereAmI(void){ +static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c index 8df0f122f..09131fbdb 100644 --- a/driver/level3/syr2k_k.c +++ b/driver/level3/syr2k_k.c @@ -47,7 +47,7 @@ #endif #endif -static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { +static __inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c index 08751dc8b..8bc817f87 100644 --- a/driver/level3/syrk_k.c +++ b/driver/level3/syrk_k.c @@ -49,7 +49,7 @@ #endif #endif -static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { +static __inline int 
syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index a76be3ba7..b4f33583f 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,7 +67,7 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 -static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { +static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); double n = (double)(N - IS - BK); diff --git a/symcopy.h b/symcopy.h index 48ccbd369..16172c046 100644 --- a/symcopy.h +++ b/symcopy.h @@ -43,7 +43,7 @@ #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) -static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -141,7 +141,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -232,7 +232,7 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -362,7 +362,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -486,7 +486,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } 
-static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -613,7 +613,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -735,7 +735,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -862,7 +862,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -984,7 +984,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1082,7 +1082,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1180,7 +1180,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1270,7 +1270,7 @@ static inline void 
TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1360,7 +1360,7 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1490,7 +1490,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1620,7 +1620,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1744,7 +1744,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; From e74462a3f5d83853e39ac78404ba579fc9d54ea0 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 11 Feb 2015 11:16:57 -0600 Subject: [PATCH 075/257] Moved declarations to start of functions to satisfy MSVC C89 implementation. 
--- driver/level3/trmm_R.c | 5 +++-- driver/level3/trsm_L.c | 5 +++-- driver/level3/trsm_R.c | 5 +++-- interface/gemm.c | 14 ++++++++++---- interface/gemv.c | 12 +++++++++--- lapack/getrf/getrf_parallel.c | 4 ++++ 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index bdd9370cd..0882aa496 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -70,6 +70,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) + BLASLONG start_ls; +#endif m = args -> m; n = args -> n; @@ -226,8 +229,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } #else - BLASLONG start_ls; - for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index 78da0eb6c..d8130ee7e 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -76,6 +76,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) + BLASLONG start_is; +#endif m = args -> m; n = args -> n; @@ -178,8 +181,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } } #else - BLASLONG start_is; - for(ls = m; ls > 0; ls -= GEMM_Q){ min_l = ls; if (min_l > GEMM_Q) min_l = GEMM_Q; diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index 169441d1e..f6a57f93f 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -75,6 +75,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((defined(UPPER) && 
!defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))) + BLASLONG start_ls; +#endif m = args -> m; n = args -> n; @@ -226,8 +229,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } #else - BLASLONG start_ls; - for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; diff --git a/interface/gemm.c b/interface/gemm.c index a5a2b4724..7253b0500 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -121,6 +121,9 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *sa, *sb; #ifdef SMP + int nthreads_max; + int nthreads_avail; + double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -237,6 +240,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *sa, *sb; #ifdef SMP + int nthreads_max; + int nthreads_avail; + double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -400,15 +406,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); - int nthreads_max = num_cpu_avail(3); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(3); + nthreads_avail = nthreads_max; #ifndef COMPLEX - double MNK = (double) args.m * (double) args.n * (double) args.k; + MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #else - double MNK = (double) args.m * (double) args.n * (double) args.k; + MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #endif diff --git a/interface/gemv.c b/interface/gemv.c index 2dd82dce5..638329a2c 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -80,6 +80,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *buffer; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int 
(*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -134,6 +137,9 @@ void CNAME(enum CBLAS_ORDER order, blasint info, t; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -212,10 +218,10 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - int nthreads_max = num_cpu_avail(2); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(2); + nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; + MNK = (double) m * (double) n; if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index b4f33583f..8fdf76987 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -373,7 +373,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG num_cpu; +#ifdef _MSC_VER + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; +#else volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#endif #ifndef COMPLEX #ifdef XDOUBLE From 4662a0b13aad3783645cbaa5f8035701b921d61f Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sun, 15 Feb 2015 17:44:37 -0600 Subject: [PATCH 076/257] Changed generate functions to iterate through a list of float types. This will generate obj files for SINGLE/DOUBLE/COMPLEX/DOUBLE COMPLEX. 
--- CMakeLists.txt | 36 +++++++- cmake/os.cmake | 2 +- cmake/system.cmake | 6 +- cmake/utils.cmake | 109 +++++++++++++---------- driver/level2/CMakeLists.txt | 22 ++--- driver/level3/CMakeLists.txt | 18 ++-- driver/others/CMakeLists.txt | 8 +- interface/CMakeLists.txt | 26 +++--- kernel/CMakeLists.txt | 167 ++++++++++++++++++----------------- lapack/CMakeLists.txt | 14 +-- 10 files changed, 228 insertions(+), 180 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9db677d35..5dd811959 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open enable_language(Fortran) enable_language(ASM) -message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only DOUBLE and x86 support is currently available.") +message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") @@ -36,6 +36,36 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () +# set which float types we want to build for +if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) + # if none are defined, build for all + set(BUILD_SINGLE true) + set(BUILD_DOUBLE true) + set(BUILD_COMPLEX true) + set(BUILD_COMPLEX16 true) +endif () + +set(FLOAT_TYPES "") +if (BUILD_SINGLE) + message(STATUS "Building Single Precision") + list(APPEND FLOAT_TYPES "SINGLE") # defines nothing +endif () + +if (BUILD_DOUBLE) + message(STATUS "Building Double Precision") + list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE +endif () + +if (BUILD_COMPLEX) + message(STATUS "Building Complex Precision") + list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX +endif () + +if (BUILD_COMPLEX16) + message(STATUS "Building Double 
Complex Precision") + list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE +endif () + set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench) # all :: libs netlib tests shared @@ -62,8 +92,8 @@ endforeach () # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (DBLAS_OBJ ${DBLAS_OBJS}) - get_target_property(PREV_DEFS ${DBLAS_OBJ} COMPILE_DEFINITIONS) - set_target_properties(${DBLAS_OBJ} PROPERTIES COMPILE_DEFINITIONS "${PREV_DEFS};DOUBLE") + #get_target_property(PREV_DEFS ${DBLAS_OBJ} COMPILE_DEFINITIONS) + #set_target_properties(${DBLAS_OBJ} PROPERTIES COMPILE_DEFINITIONS "${PREV_DEFS};DOUBLE") list(APPEND TARGET_OBJS "$") endforeach () diff --git a/cmake/os.cmake b/cmake/os.cmake index eb7df31ed..f5a75027c 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -57,7 +57,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 if (${ARCH} STREQUAL "x86") - if (NOT MSVC) + if (NOT MSVC AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") diff --git a/cmake/system.cmake b/cmake/system.cmake index 3d58fa2e3..cc7373e47 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -32,7 +32,7 @@ if (DEFINED TARGET) set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () -if (${INTERFACE64}) +if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") endif () @@ -43,12 +43,12 @@ endif () message(STATUS "GEMM multithread threshold set to ${GEMM_MULTITHREAD_THRESHOLD}.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DGEMM_MULTITHREAD_THRESHOLD=${GEMM_MULTITHREAD_THRESHOLD}") -if (${NO_AVX}) +if (NO_AVX) message(STATUS "Disabling Advanced Vector Extensions (AVX).") set(GETARCH_FLAGS 
"${GETARCH_FLAGS} -DNO_AVX") endif () -if (${NO_AVX2}) +if (NO_AVX2) message(STATUS "Disabling Advanced Vector Extensions 2 (AVX2).") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") endif () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 715f298b7..81083a19f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -64,85 +64,98 @@ endfunction () # generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition # @param sources_in the source files to build from -# @param float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) +# @param no_float_type turns off the float type define for this build (e.g. 
SINGLE/DOUBLE/etc) function(GenerateNamedObjects sources_in) if (DEFINED ARGV1) - set(float_type_in ${ARGV1}) + set(defines_in ${ARGV1}) endif () if (DEFINED ARGV2) - set(defines_in ${ARGV2}) + set(name_in ${ARGV2}) endif () if (DEFINED ARGV3) - set(name_in ${ARGV3}) + set(use_cblas ${ARGV3}) + else () + set(use_cblas 0) endif () if (DEFINED ARGV4) - set(use_cblas ${ARGV4}) - else () - set(use_cblas 0) + set(replace_last_with ${ARGV4}) endif () if (DEFINED ARGV5) - set(replace_last_with ${ARGV5}) + set(append_with ${ARGV5}) endif () if (DEFINED ARGV6) - set(append_with ${ARGV6}) + set(no_float_type ${ARGV6}) + else () + set(no_float_type false) + endif () + + if (no_float_type) + set(float_list "DUMMY") # still need to loop once + else () + set(float_list "${FLOAT_TYPES}") endif () set(OBJ_LIST_OUT "") - foreach (source_file ${sources_in}) + foreach (float_type ${float_list}) + foreach (source_file ${sources_in}) - if (DEFINED float_type_in AND NOT float_type_in STREQUAL "") - string(SUBSTRING ${float_type_in} 0 1 float_char) - string(TOLOWER ${float_char} float_char) - endif () + if (NOT no_float_type) + string(SUBSTRING ${float_type} 0 1 float_char) + string(TOLOWER ${float_char} float_char) + endif () - if (NOT name_in) - get_filename_component(source_name ${source_file} NAME_WE) - set(obj_name "${float_char}${source_name}") - else () - # replace * with float_char - if (${name_in} MATCHES "\\*") - string(REPLACE "*" ${float_char} obj_name ${name_in}) + if (NOT name_in) + get_filename_component(source_name ${source_file} NAME_WE) + set(obj_name "${float_char}${source_name}") else () - set(obj_name "${float_char}${name_in}") + # replace * with float_char + if (${name_in} MATCHES "\\*") + string(REPLACE "*" ${float_char} obj_name ${name_in}) + else () + set(obj_name "${float_char}${name_in}") + endif () endif () - endif () - if (replace_last_with) - string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name}) - else () - set(obj_name 
"${obj_name}${append_with}") - endif () + if (replace_last_with) + string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name}) + else () + set(obj_name "${obj_name}${append_with}") + endif () - # now add the object and set the defines - set(obj_defines ${defines_in}) + # now add the object and set the defines + set(obj_defines ${defines_in}) - if (use_cblas) - set(obj_name "cblas_${obj_name}") - list(APPEND obj_defines "CBLAS") - endif () + if (use_cblas) + set(obj_name "cblas_${obj_name}") + list(APPEND obj_defines "CBLAS") + endif () - list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") - list(APPEND obj_defines ${defines_in}) - if (NOT ${float_type_in} STREQUAL "SINGLE") - list(APPEND obj_defines ${float_type_in}) - endif () + list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") + list(APPEND obj_defines ${defines_in}) + if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") + list(APPEND obj_defines "DOUBLE") + endif () + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + list(APPEND obj_defines "COMPLEX") + endif () - add_library(${obj_name} OBJECT ${source_file}) - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") + add_library(${obj_name} OBJECT ${source_file}) + set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") - list(APPEND OBJ_LIST_OUT ${obj_name}) + list(APPEND OBJ_LIST_OUT ${obj_name}) + endforeach () endforeach () list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) @@ -152,7 +165,6 @@ endfunction () # generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in # @param sources_in the source files to build from # @param defines_in the 
preprocessor definitions that will be combined to create the object files -# @param float_type_in the float type to define for this build (e.g. SINGLE/DOUBLE/etc) # @param all_defines_in (optional) preprocessor definitions that will be applied to all objects # @param replace_scheme If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k.c with TRANS and UNIT defined will be symm_TU. # If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. @@ -160,10 +172,15 @@ endfunction () # If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). # If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel # @param alternate_name replaces the source name as the object name (define codes are still appended) -function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_type_in all_defines_in replace_scheme) +# @param no_float_type turns off the float type define for this build (e.g. 
SINGLE/DOUBLE/etc) +function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) + + if (DEFINED ARGV5) + set(alternate_name ${ARGV5}) + endif () if (DEFINED ARGV6) - set(alternate_name ${ARGV6}) + set(no_float_type ${ARGV6}) endif () AllCombinations("${defines_in}" "${absent_codes_in}") @@ -223,7 +240,7 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in float_ endif () endif () - GenerateNamedObjects("${source_file}" "${float_type_in}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}") + GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}" "${no_float_type}") list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index d8f8123d3..4524ad688 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -28,26 +28,26 @@ set(NU_SOURCES ) # objects that need LOWER set -GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "DOUBLE" "" 1) +GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1) # objects that need TRANSA and UNIT set # N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa -GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "DOUBLE" "" 3) +GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "" 3) # gbmv uses a lowercase n and t. WHY? WHO KNOWS! -GenerateNamedObjects("gbmv_k.c" "DOUBLE" "" "gbmv_n") -GenerateNamedObjects("gbmv_k.c" "DOUBLE" "TRANS" "gbmv_t") +GenerateNamedObjects("gbmv_k.c" "" "gbmv_n") +GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t") if (SMP) # gbmv uses a lowercase n and t. N.B. this uses TRANSA where gbmv.c uses TRANS. Intentional? 
- GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "" "gbmv_thread_n") - GenerateNamedObjects("gbmv_thread.c" "DOUBLE" "TRANSA" "gbmv_thread_t") + GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n") + GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t") - GenerateNamedObjects("gemv_thread.c" "DOUBLE" "" "gemv_thread_n") - GenerateNamedObjects("gemv_thread.c" "DOUBLE" "TRANSA" "gemv_thread_t") + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n") + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t") - GenerateNamedObjects("ger_thread.c" "DOUBLE") + GenerateNamedObjects("ger_thread.c") set(UL_SMP_SOURCES symv_thread.c @@ -59,7 +59,7 @@ if (SMP) sbmv_thread.c ) - GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "DOUBLE" "" 2) + GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) set(NU_SMP_SOURCES trmv_thread.c @@ -67,7 +67,7 @@ if (SMP) tbmv_thread.c ) - GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "DOUBLE" "" 2) + GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2) endif () diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index b9a817323..7259a87e7 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -26,25 +26,25 @@ endif () set(GEMM_DEFINES NN NT TN TT) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) - GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) - GenerateNamedObjects("gemm.c" "DOUBLE" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () endforeach () -GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "DOUBLE" "" 0) 
-GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "NN" 1) -GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "" 1) -GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "DOUBLE" "" 2) +GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "" 0) +GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "NN" 1) +GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "" 1) +GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "" 2) if (SMP) # N.B. these do NOT have a float type (e.g. DOUBLE) defined! - GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" "" 0) + GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" 0 "" "" 1) if (NOT USE_SIMPLE_THREADED_LEVEL3) - GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "DOUBLE" "THREADED_LEVEL3" 2 "syrk_thread") - GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "DOUBLE" "THREADED_LEVEL3;NN" 2 "symm_thread") + GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "THREADED_LEVEL3" 2 "syrk_thread") + GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "THREADED_LEVEL3;NN" 2 "symm_thread") endif () endif () diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 7f8672eb2..3e17ce5be 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -35,10 +35,10 @@ set(COMMON_SOURCES openblas_error_handle.c ) -# these need to have NAME/CNAME set, so use GenerateNamedObjects -GenerateNamedObjects("abs.c" "" "" "c_abs") -GenerateNamedObjects("abs.c" "" "DOUBLE" "z_abs") -GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c") +# these need to have NAME/CNAME set, so use GenerateNamedObjects, but don't use standard name mangling +GenerateNamedObjects("abs.c" "" "c_abs" 0 "" "" 1 ) 
+GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) +GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic.c) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 1b0ac42d6..739705d17 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -40,23 +40,23 @@ endif () foreach (CBLAS_FLAG ${CBLAS_FLAGS}) - GenerateNamedObjects("${BLAS1_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("${BLAS2_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("${BLAS3_SOURCES}" "DOUBLE" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG}) # trmm is trsm with a compiler flag set - GenerateNamedObjects("trsm.c" "DOUBLE" "TRMM" "trmm" ${CBLAS_FLAG}) + GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) # max and imax are compiled 4 times - GenerateNamedObjects("max.c" "DOUBLE" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS" "amax" ${CBLAS_FLAG}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) - GenerateNamedObjects("max.c" "DOUBLE" "USE_MIN" "min" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_ABS" "amax" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_MIN" "min" ${CBLAS_FLAG}) - GenerateNamedObjects("imax.c" "DOUBLE" "" "i*max" ${CBLAS_FLAG}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS" "i*amax" ${CBLAS_FLAG}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) - GenerateNamedObjects("imax.c" "DOUBLE" "USE_MIN" "i*min" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "" "i*max" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" ${CBLAS_FLAG}) + 
GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) endforeach () @@ -66,7 +66,7 @@ if (NOT DEFINED NO_LAPACK) lapack/potf2.c lapack/laswp.c lapack/gesv.c lapack/lauu2.c lapack/lauum.c lapack/trti2.c lapack/trtri.c ) - GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE" "" "" 0) + GenerateNamedObjects("${LAPACK_SOURCES}") endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index a83bd0dbe..6c259039b 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -11,9 +11,10 @@ endif () set(LSAME_KERNEL lsame.S) set(SCABS_KERNEL cabs.S) set(DCABS_KERNEL cabs.S) -GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "" "F_INTERFACE" "lsame") -GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "SINGLE" "COMPLEX;F_INTERFACE" "cabs1") -GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE" "COMPLEX;F_INTERFACE" "cabs1") +# don't use float type name mangling here +GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "F_INTERFACE" "lsame" 0 "" "" 1) +GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs1" "" "" 1) +GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" 0 "" "" 1) # Makefile.L1 @@ -37,30 +38,30 @@ set(DSCALKERNEL scal.S) set(DSWAPKERNEL swap.S) set(DAXPBYKERNEL ../arm/axpby.c) -GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "DOUBLE" "USE_ABS" "amax_k") -GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "DOUBLE" "USE_ABS;USE_MIN" "amin_k") -GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "DOUBLE" "" "max_k") -GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "DOUBLE" "" "min_k") -GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "DOUBLE" "USE_ABS" "i*amax_k") -GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "DOUBLE" "USE_ABS;USE_MIN" "i*amin_k") 
-GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "DOUBLE" "" "i*max_k") -GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "DOUBLE" "" "i*min_k") -GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "DOUBLE" "" "asum_k") -GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "DOUBLE" "" "axpy_k") -GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "DOUBLE" "C_INTERFACE" "copy_k") -GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "DOUBLE" "" "dot_k") -GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "DOUBLE" "" "nrm2_k") -GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "DOUBLE" "" "rot_k") -GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "DOUBLE" "" "scal_k") -GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "DOUBLE" "" "swap_k") -GenerateNamedObjects("${KERNELDIR}/${DAXPBYKERNEL}" "DOUBLE" "" "axpby_k") +GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k") +GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k") +GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k") +GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "" "min_k") +GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k") +GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k") +GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k") +GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "" "i*min_k") +GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k") +GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k") +GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k") +GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k") +GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k") +GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k") +GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k") +GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k") +GenerateNamedObjects("${KERNELDIR}/${DAXPBYKERNEL}" "" "axpby_k") # 
Makefile.L2 GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") -GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "DOUBLE" "TRANS") -GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "DOUBLE" "" 1) -GenerateNamedObjects("generic/ger.c" "DOUBLE" "" "ger_k") +GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "TRANS") +GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1) +GenerateNamedObjects("generic/ger.c" "" "ger_k") # Makefile.L3 @@ -76,78 +77,78 @@ set(DGEMMITCOPYOBJ gemm_itcopy) set(DGEMMONCOPYOBJ gemm_oncopy) set(DGEMMOTCOPYOBJ gemm_otcopy) -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "" "gemm_kernel") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel") if (DGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "" "${DGEMMINCOPYOBJ}") + GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "" "${DGEMMINCOPYOBJ}") endif () if (DGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "" "${DGEMMITCOPYOBJ}") + GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "" "${DGEMMITCOPYOBJ}") endif () if (DGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "" "${DGEMMONCOPYOBJ}") + GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "" "${DGEMMONCOPYOBJ}") endif () if (DGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "" "${DGEMMOTCOPYOBJ}") + GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "" "${DGEMMOTCOPYOBJ}") endif () -GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "DOUBLE") -GenerateCombinationObjects("${KERNELDIR}/${DGEMMKERNEL}" "LEFT;TRANSA" "R;N" "DOUBLE" "TRMMKERNEL" 2 "trmm_kernel") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "LT;TRSMKERNEL" "trsm_kernel_LT") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "DOUBLE" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" 
"DOUBLE" "RT;TRSMKERNEL" "trsm_kernel_RT") +GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}") +GenerateCombinationObjects("${KERNELDIR}/${DGEMMKERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "LT;TRSMKERNEL" "trsm_kernel_LT") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "RT;TRSMKERNEL" "trsm_kernel_RT") # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iunucopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iunncopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_ounucopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trmm_ounncopy") - -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_ilnucopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_ilnncopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_olnucopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_olnncopy") - -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trmm_iutucopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trmm_iutncopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trmm_outucopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" 
"trmm_outncopy") - -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trmm_iltucopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trmm_iltncopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trmm_oltucopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trmm_oltncopy") - -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iunucopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iunncopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_ounucopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_ounncopy") - -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_ilnucopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_ilnncopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_olnucopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_olnncopy") - -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "UNIT" "trsm_iutucopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "" "trsm_iutncopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;UNIT" "trsm_outucopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER" "trsm_outncopy") - -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER;UNIT" "trsm_iltucopy") -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "LOWER" "trsm_iltncopy") -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER;UNIT" "trsm_oltucopy") 
-GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "DOUBLE" "OUTER;LOWER" "trsm_oltncopy") - -GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "OUTER" "symm_outcopy") -GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "" "symm_iutcopy") - -GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER;OUTER" "symm_oltcopy") -GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "DOUBLE" "LOWER" "symm_iltcopy") +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy") +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "" "trmm_iunncopy") +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_ounucopy") +GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER" "trmm_ounncopy") + +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy") +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy") +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy") +GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_olnncopy") + +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy") +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "" "trmm_iutncopy") +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_outucopy") +GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER" "trmm_outncopy") + +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy") +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy") +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy") +GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_oltncopy") + 
+GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy") +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "" "trsm_iunncopy") +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_ounucopy") +GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER" "trsm_ounncopy") + +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy") +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy") +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy") +GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_olnncopy") + +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy") +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "" "trsm_iutncopy") +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_outucopy") +GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER" "trsm_outncopy") + +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy") +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy") +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy") +GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_oltncopy") + +GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "OUTER" "symm_outcopy") +GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "" "symm_iutcopy") + +GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy") +GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "LOWER" "symm_iltcopy") if (NOT DEFINED DOMATCOPY_CN) set(DOMATCOPY_CN ../arm/omatcopy_cn.c) @@ -162,10 +163,10 @@ if (NOT DEFINED DOMATCOPY_RT) set(DOMATCOPY_RT ../arm/omatcopy_rt.c) endif () 
-GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CN}" "DOUBLE" "" "domatcopy_k_cn") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RN}" "DOUBLE" "ROWM" "domatcopy_k_rn") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CT}" "DOUBLE" "" "domatcopy_k_ct") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RT}" "DOUBLE" "ROWM" "domatcopy_k_rt") +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CN}" "" "domatcopy_k_cn") +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RN}" "ROWM" "domatcopy_k_rn") +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CT}" "" "domatcopy_k_ct") +GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RT}" "ROWM" "domatcopy_k_rt") # Makefile.LA #DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 346f96e34..ed598f22d 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -50,11 +50,11 @@ set(ZLAPACK_SOURCES trtri/trtri_L_single.c ) -GenerateNamedObjects("${LAPACK_SOURCES}" "DOUBLE") +GenerateNamedObjects("${LAPACK_SOURCES}") # TODO: laswp needs arch specific code -GenerateNamedObjects("laswp/generic/laswp_k.c" "DOUBLE" "" "laswp_plus") -GenerateNamedObjects("laswp/generic/laswp_k.c" "DOUBLE" "MINUS" "laswp_minus") +GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus") +GenerateNamedObjects("laswp/generic/laswp_k.c" "MINUS" "laswp_minus") if (SMP) @@ -92,12 +92,12 @@ if (SMP) trtri/trtri_L_parallel.c ) - GenerateNamedObjects("${PARALLEL_SOURCES}" "DOUBLE" "" "" 0) + GenerateNamedObjects("${PARALLEL_SOURCES}") endif () -GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "DOUBLE" "" 4) -GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "DOUBLE" "" 4) -GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "DOUBLE" "" 0) +GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4) +GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) +GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0) set(DBLAS_OBJS ${DBLAS_OBJS} 
PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 9eb1499095c6bf523ae4024f3707fd88cad7b131 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 17 Feb 2015 10:30:28 -0600 Subject: [PATCH 077/257] Added another param to GenerateNamedObjects to mangle complex source names. There are a lot of sources for complex float types that are the same names as the real sources, except with z prepended. --- CMakeLists.txt | 2 -- cmake/utils.cmake | 31 +++++++++++++++++++++++++++++++ interface/CMakeLists.txt | 33 ++++++++++++++++++++++++++++----- 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5dd811959..85b20b176 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,8 +92,6 @@ endforeach () # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (DBLAS_OBJ ${DBLAS_OBJS}) - #get_target_property(PREV_DEFS ${DBLAS_OBJ} COMPILE_DEFINITIONS) - #set_target_properties(${DBLAS_OBJ} PROPERTIES COMPILE_DEFINITIONS "${PREV_DEFS};DOUBLE") list(APPEND TARGET_OBJS "$") endforeach () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 81083a19f..aaa669abd 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -70,6 +70,11 @@ endfunction () # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) # @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) +# @param complex_only/real_only some routines have separate source files for complex and non-complex float types. 
+# 0 - compiles for all types +# 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) +# 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) +# 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c) function(GenerateNamedObjects sources_in) if (DEFINED ARGV1) @@ -100,10 +105,30 @@ function(GenerateNamedObjects sources_in) set(no_float_type false) endif () + set(real_only false) + set(complex_only false) + set(mangle_complex_sources false) + if (DEFINED ARGV7) + if (${ARGV7} EQUAL 1) + set(real_only true) + elseif (${ARGV7} EQUAL 2) + set(complex_only true) + elseif (${ARGV7} EQUAL 3) + set(mangle_complex_sources true) + endif () + endif () + if (no_float_type) set(float_list "DUMMY") # still need to loop once else () set(float_list "${FLOAT_TYPES}") + if (complex_only) + list(REMOVE_ITEM float_list "SINGLE") + list(REMOVE_ITEM float_list "DOUBLE") + elseif (real_only) + list(REMOVE_ITEM float_list "COMPLEX") + list(REMOVE_ITEM float_list "ZCOMPLEX") + endif () endif () set(OBJ_LIST_OUT "") @@ -148,6 +173,12 @@ function(GenerateNamedObjects sources_in) endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") + if (mangle_complex_sources) + # add a z to the filename + get_filename_component(source_name ${source_file} NAME) + get_filename_component(source_dir ${source_file} DIRECTORY) + string(REPLACE ${source_name} "z${source_name}" source_file ${source_file}) + endif () endif () add_library(${obj_name} OBJECT ${source_file}) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 739705d17..030a14fd2 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -2,15 +2,25 @@ include_directories(${CMAKE_SOURCE_DIR}) set(BLAS1_SOURCES + copy.c + asum.c nrm2.c +) + +set(BLAS1_REAL_ONLY_SOURCES + rotm.c rotmg.c # N.B. 
these do not have complex counterparts +) + +# these will have 'z' prepended for the complex version +set(BLAS1_MANGLED_SOURCES axpy.c swap.c - copy.c scal.c + scal.c dot.c - asum.c nrm2.c - rot.c rotg.c rotm.c rotmg.c + rot.c rotg.c axpby.c ) # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c trsv.c trmv.c symv.c @@ -24,6 +34,9 @@ set(BLAS2_SOURCES set(BLAS3_SOURCES gemm.c symm.c trsm.c syrk.c syr2k.c +) + +set(BLAS3_MANGLED_SOURCES omatcopy.c imatcopy.c ) @@ -41,8 +54,11 @@ endif () foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 1) + GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) + GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) @@ -62,11 +78,18 @@ endforeach () if (NOT DEFINED NO_LAPACK) set(LAPACK_SOURCES + lapack/gesv.c + ) + + # prepend z for complex versions + set(LAPACK_MANGLED_SOURCES lapack/getrf.c lapack/getrs.c lapack/potrf.c lapack/getf2.c - lapack/potf2.c lapack/laswp.c lapack/gesv.c lapack/lauu2.c + lapack/potf2.c lapack/laswp.c lapack/lauu2.c lapack/lauum.c lapack/trti2.c lapack/trtri.c ) + GenerateNamedObjects("${LAPACK_SOURCES}") + GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 67e39bd8fb797cc420b191780fa4ae3ae00792ff Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 17 Feb 2015 13:12:30 -0600 Subject: [PATCH 078/257] Added mangled complex 
filenames to interface and lapack CMakeLists.txt. --- cmake/utils.cmake | 13 +++++++++---- interface/CMakeLists.txt | 20 ++++++++++++++------ lapack/CMakeLists.txt | 40 ++++++++++++++++++++-------------------- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index aaa669abd..d9c180fb6 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -70,7 +70,7 @@ endfunction () # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) # @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) -# @param complex_only/real_only some routines have separate source files for complex and non-complex float types. +# @param complex_filename_scheme some routines have separate source files for complex and non-complex float types. # 0 - compiles for all types # 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) # 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) @@ -88,7 +88,7 @@ function(GenerateNamedObjects sources_in) if (DEFINED ARGV3) set(use_cblas ${ARGV3}) else () - set(use_cblas 0) + set(use_cblas false) endif () if (DEFINED ARGV4) @@ -108,7 +108,7 @@ function(GenerateNamedObjects sources_in) set(real_only false) set(complex_only false) set(mangle_complex_sources false) - if (DEFINED ARGV7) + if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) @@ -204,6 +204,7 @@ endfunction () # If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel # @param alternate_name replaces the source name as the object name (define codes are still appended) # @param no_float_type turns off the float type define for this build (e.g. 
SINGLE/DOUBLE/etc) +# @param complex_filename_scheme see GenerateNamedObjects function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) if (DEFINED ARGV5) @@ -214,6 +215,10 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de set(no_float_type ${ARGV6}) endif () + if (DEFINED ARGV7) + set(complex_filename_scheme ${ARGV7}) + endif () + AllCombinations("${defines_in}" "${absent_codes_in}") set(define_combos ${LIST_OUT}) set(define_codes ${CODES_OUT}) @@ -271,7 +276,7 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de endif () endif () - GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}" "${no_float_type}") + GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 030a14fd2..633b8a6fe 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -31,6 +31,7 @@ set(BLAS2_SOURCES tpsv.c tpmv.c ) +# these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c trsm.c syrk.c syr2k.c @@ -53,12 +54,19 @@ endif () foreach (CBLAS_FLAG ${CBLAS_FLAGS}) - GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 1) - GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) - GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) - GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG}) - GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" 0 3) + # TODO: don't compile complex sources with cblas for now, the naming schemes are all different and they will have to be handled separately from 
SINGLE/DOUBLE + set(DISABLE_COMPLEX 0) + set(MANGLE_COMPLEX 3) + if (CBLAS_FLAG EQUAL 1) + set(DISABLE_COMPLEX 1) + set(MANGLE_COMPLEX 1) + endif () + GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) + GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) + GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index ed598f22d..26922f50e 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -3,29 +3,36 @@ include_directories(${CMAKE_SOURCE_DIR}) set(LAPACK_SOURCES - getf2/getf2_k.c getrf/getrf_single.c potrf/potrf_U_single.c potrf/potrf_L_single.c - potf2/potf2_U.c - potf2/potf2_L.c - lauu2/lauu2_U.c - lauu2/lauu2_L.c lauum/lauum_U_single.c lauum/lauum_L_single.c ) +# add a 'z' to filename for complex version +set(LAPACK_MANGLED_SOURCES + getf2/getf2_k.c + lauu2/lauu2_U.c + lauu2/lauu2_L.c + potf2/potf2_U.c + potf2/potf2_L.c +) + # sources that need TRANS set +# this has a 'z' version set(TRANS_SOURCES getrs/getrs_single.c ) # sources that need UNIT set +# these do NOT have a z version set(UNIT_SOURCES trtri/trtri_U_single.c trtri/trtri_L_single.c ) +# these have a 'z' version set(UNIT_SOURCES2 trti2/trti2_U.c trti2/trti2_L.c @@ -51,6 +58,7 @@ set(ZLAPACK_SOURCES ) GenerateNamedObjects("${LAPACK_SOURCES}") +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" "" 3) # TODO: laswp needs arch specific code GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus") @@ -64,40 +72,32 @@ if 
(SMP) set(GETRF_SRC getrf/getrf_parallel.c) endif () + # these do not have 'z' versions set(PARALLEL_SOURCES ${GETRF_SRC} - potrf/potrf_U_parallel.c - potrf/potrf_L_parallel.c lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c + potrf/potrf_U_parallel.c + potrf/potrf_L_parallel.c ) + # this has a z version list(APPEND TRANS_SOURCES getrs/getrs_parallel.c ) + # these do NOT have a z version list(APPEND UNIT_SOURCES trtri/trtri_U_parallel.c trtri/trtri_L_parallel.c ) - set(ZPARALLEL_SOURCES - ${GETRF_SRC} - getrs/zgetrs_parallel.c - potrf/potrf_U_parallel.c - potrf/potrf_L_parallel.c - lauum/lauum_U_parallel.c - lauum/lauum_L_parallel.c - trtri/trtri_U_parallel.c - trtri/trtri_L_parallel.c - ) - GenerateNamedObjects("${PARALLEL_SOURCES}") endif () -GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4) +GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" "" 3) GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) -GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0) +GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From 33c5e8db7f60035c748dc1aac370775025936a34 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 17 Feb 2015 21:36:23 -0600 Subject: [PATCH 079/257] Added a helper function for setting the L1 kernel defaults. Added loop to build objects with different KERNEL defines. --- cmake/kernel.cmake | 110 +++++++++++++++++++++++++++++++++++ cmake/utils.cmake | 47 +++++++++++---- driver/level2/CMakeLists.txt | 8 +-- kernel/CMakeLists.txt | 93 ++++++++++++++--------------- 4 files changed, 197 insertions(+), 61 deletions(-) create mode 100644 cmake/kernel.cmake diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake new file mode 100644 index 000000000..211da229d --- /dev/null +++ b/cmake/kernel.cmake @@ -0,0 +1,110 @@ +# helper functions for the kernel CMakeLists.txt + + +# Set the default filenames for L1 objects. 
Most of these will be overriden by the appropriate KERNEL file. +macro(SetDefaultL1) + set(SAMAXKERNEL amax.S) + set(DAMAXKERNEL amax.S) + set(QAMAXKERNEL amax.S) + set(CAMAXKERNEL zamax.S) + set(ZAMAXKERNEL zamax.S) + set(XAMAXKERNEL zamax.S) + set(SAMINKERNEL amin.S) + set(DAMINKERNEL amin.S) + set(QAMINKERNEL amin.S) + set(CAMINKERNEL zamin.S) + set(ZAMINKERNEL zamin.S) + set(XAMINKERNEL zamin.S) + set(SMAXKERNEL max.S) + set(DMAXKERNEL max.S) + set(QMAXKERNEL max.S) + set(SMINKERNEL min.S) + set(DMINKERNEL min.S) + set(QMINKERNEL min.S) + set(ISAMAXKERNEL iamax.S) + set(IDAMAXKERNEL iamax.S) + set(IQAMAXKERNEL iamax.S) + set(ICAMAXKERNEL izamax.S) + set(IZAMAXKERNEL izamax.S) + set(IXAMAXKERNEL izamax.S) + set(ISAMINKERNEL iamin.S) + set(IDAMINKERNEL iamin.S) + set(IQAMINKERNEL iamin.S) + set(ICAMINKERNEL izamin.S) + set(IZAMINKERNEL izamin.S) + set(IXAMINKERNEL izamin.S) + set(ISMAXKERNEL iamax.S) + set(IDMAXKERNEL iamax.S) + set(IQMAXKERNEL iamax.S) + set(ISMINKERNEL iamin.S) + set(IDMINKERNEL iamin.S) + set(IQMINKERNEL iamin.S) + set(SASUMKERNEL asum.S) + set(DASUMKERNEL asum.S) + set(CASUMKERNEL zasum.S) + set(ZASUMKERNEL zasum.S) + set(QASUMKERNEL asum.S) + set(XASUMKERNEL zasum.S) + set(SAXPYKERNEL axpy.S) + set(DAXPYKERNEL axpy.S) + set(CAXPYKERNEL zaxpy.S) + set(ZAXPYKERNEL zaxpy.S) + set(QAXPYKERNEL axpy.S) + set(XAXPYKERNEL zaxpy.S) + set(SCOPYKERNEL copy.S) + set(DCOPYKERNEL copy.S) + set(CCOPYKERNEL zcopy.S) + set(ZCOPYKERNEL zcopy.S) + set(QCOPYKERNEL copy.S) + set(XCOPYKERNEL zcopy.S) + set(SDOTKERNEL dot.S) + set(DDOTKERNEL dot.S) + set(CDOTKERNEL zdot.S) + set(ZDOTKERNEL zdot.S) + set(QDOTKERNEL dot.S) + set(XDOTKERNEL zdot.S) + set(SNRM2KERNEL nrm2.S) + set(DNRM2KERNEL nrm2.S) + set(QNRM2KERNEL nrm2.S) + set(CNRM2KERNEL znrm2.S) + set(ZNRM2KERNEL znrm2.S) + set(XNRM2KERNEL znrm2.S) + set(SROTKERNEL rot.S) + set(DROTKERNEL rot.S) + set(QROTKERNEL rot.S) + set(CROTKERNEL zrot.S) + set(ZROTKERNEL zrot.S) + set(XROTKERNEL zrot.S) + set(SSCALKERNEL 
scal.S) + set(DSCALKERNEL scal.S) + set(CSCALKERNEL zscal.S) + set(ZSCALKERNEL zscal.S) + set(QSCALKERNEL scal.S) + set(XSCALKERNEL zscal.S) + set(SSWAPKERNEL swap.S) + set(DSWAPKERNEL swap.S) + set(CSWAPKERNEL zswap.S) + set(ZSWAPKERNEL zswap.S) + set(QSWAPKERNEL swap.S) + set(XSWAPKERNEL zswap.S) + set(SGEMVNKERNEL gemv_n.S) + set(SGEMVTKERNEL gemv_t.S) + set(DGEMVNKERNEL gemv_n.S) + set(DGEMVTKERNEL gemv_t.S) + set(CGEMVNKERNEL zgemv_n.S) + set(CGEMVTKERNEL zgemv_t.S) + set(ZGEMVNKERNEL zgemv_n.S) + set(ZGEMVTKERNEL zgemv_t.S) + set(QGEMVNKERNEL gemv_n.S) + set(QGEMVTKERNEL gemv_t.S) + set(XGEMVNKERNEL zgemv_n.S) + set(XGEMVTKERNEL zgemv_t.S) + set(SCABS_KERNEL cabs.S) + set(DCABS_KERNEL cabs.S) + set(QCABS_KERNEL cabs.S) + set(LSAME_KERNEL lsame.S) + set(SAXPBYKERNEL ../arm/axpby.c) + set(DAXPBYKERNEL ../arm/axpby.c) + set(CAXPBYKERNEL ../arm/zaxpby.c) + set(ZAXPBYKERNEL ../arm/zaxpby.c) +endmacro () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index d9c180fb6..9635b210c 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -12,6 +12,27 @@ function(ParseGetArchVars GETARCH_IN) endforeach () endfunction () +# Reads a Makefile into CMake vars. +# TODO: read nested Makefiles (I think 1 level should do) +# TODO: respect IFDEF/IFNDEF? +# TODO: regex replace makefile vars, e.g. 
$(TSUFFIX) is set to the target arch in the var CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +# TODO: bail when makefile is missing, like -include +function(ParseMakefileVars MAKEFILE_IN) + message(STATUS "Reading vars from ${MAKEFILE_IN}...") + file(STRINGS ${MAKEFILE_IN} makefile_contents) + foreach (makefile_line ${makefile_contents}) + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + set(var_value ${CMAKE_MATCH_2}) + set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) + message(STATUS "found var ${var_name} = ${var_value}") + else () + message(STATUS "couldn't parse ${makefile_line} into a var") + endif () + endforeach () +endfunction () + # Returns all combinations of the input list, as a list with colon-separated combinations # E.g. input of A B C returns A B C A:B A:C B:C # N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). @@ -75,6 +96,7 @@ endfunction () # 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) # 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) # 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c) +# STRING - compiles only the given type (e.g. 
DOUBLE) function(GenerateNamedObjects sources_in) if (DEFINED ARGV1) @@ -105,6 +127,12 @@ function(GenerateNamedObjects sources_in) set(no_float_type false) endif () + if (no_float_type) + set(float_list "DUMMY") # still need to loop once + else () + set(float_list "${FLOAT_TYPES}") + endif () + set(real_only false) set(complex_only false) set(mangle_complex_sources false) @@ -115,20 +143,17 @@ function(GenerateNamedObjects sources_in) set(complex_only true) elseif (${ARGV7} EQUAL 3) set(mangle_complex_sources true) + elseif (NOT ${ARGV7} EQUAL 0) + set(float_list ${ARGV7}) endif () endif () - if (no_float_type) - set(float_list "DUMMY") # still need to loop once - else () - set(float_list "${FLOAT_TYPES}") - if (complex_only) - list(REMOVE_ITEM float_list "SINGLE") - list(REMOVE_ITEM float_list "DOUBLE") - elseif (real_only) - list(REMOVE_ITEM float_list "COMPLEX") - list(REMOVE_ITEM float_list "ZCOMPLEX") - endif () + if (complex_only) + list(REMOVE_ITEM float_list "SINGLE") + list(REMOVE_ITEM float_list "DOUBLE") + elseif (real_only) + list(REMOVE_ITEM float_list "COMPLEX") + list(REMOVE_ITEM float_list "ZCOMPLEX") endif () set(OBJ_LIST_OUT "") diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 4524ad688..a1685dbd6 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -28,15 +28,15 @@ set(NU_SOURCES ) # objects that need LOWER set -GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1) +GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) # objects that need TRANSA and UNIT set # N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa -GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "" 3) +GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "" 3 "" "" 3) # gbmv uses a lowercase n and t. WHY? WHO KNOWS! 
-GenerateNamedObjects("gbmv_k.c" "" "gbmv_n") -GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t") +GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) +GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) if (SMP) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6c259039b..479b1838f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,67 +1,68 @@ include_directories(${CMAKE_SOURCE_DIR}) +include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") # Makeflie +if (DEFINED TARGET_CORE) + #override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + set(BUILD_KERNEL 1) + set(KDIR "") + set(TSUFFIX "_${TARGET_CORE}") +else () + set(TARGET_CORE ${CORE}) + set(KDIR "") + set(TSUFFIX "") +endif () + +SetDefaultL1() +#-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) +#include $(KERNELDIR)/KERNEL +ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") +ParseMakefileVars("${KERNELDIR}/KERNEL") + if (${ARCH} STREQUAL "x86") GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") endif () -# TODO: Read from ${KERNELDIR}/KERNEL - some architectures use a different lsame -set(LSAME_KERNEL lsame.S) -set(SCABS_KERNEL cabs.S) -set(DCABS_KERNEL cabs.S) # don't use float type name mangling here -GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "F_INTERFACE" "lsame" 0 "" "" 1) -GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs1" "" "" 1) -GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" 0 "" "" 1) - +GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "F_INTERFACE" "lsame" false "" "" true) +GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs1" false "" "" true) +GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" false "" "" true) # Makefile.L1 # TODO: need to read ${KERNELDIR}/KERNEL into CMake vars -set(DAMAXKERNEL amax.S) -set(DAMINKERNEL amax.S) -set(DMAXKERNEL amax.S) -set(DMINKERNEL amax.S) 
-set(IDAMAXKERNEL iamax.S) -set(IDAMINKERNEL iamax.S) -set(IDMAXKERNEL iamax.S) -set(IDMINKERNEL iamax.S) -set(DASUMKERNEL asum.S) -set(DAXPYKERNEL axpy.S) -set(DCOPYKERNEL copy.S) -set(DDOTKERNEL dot.S) -set(DNRM2KERNEL nrm2.S) -set(DROTKERNEL rot.S) -set(DSCALKERNEL scal.S) -set(DSWAPKERNEL swap.S) -set(DAXPBYKERNEL ../arm/axpby.c) - -GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k") -GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k") -GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k") -GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "" "min_k") -GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k") -GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k") -GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k") -GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "" "i*min_k") -GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k") -GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k") -GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k") -GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k") -GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k") -GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k") -GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k") -GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k") -GenerateNamedObjects("${KERNELDIR}/${DAXPBYKERNEL}" "" "axpby_k") + +foreach (float_type ${FLOAT_TYPES}) + + # a bit of metaprogramming here to pull out the appropriate KERNEL var + string(SUBSTRING ${float_type} 0 1 float_char) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false 
${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) +endforeach () # Makefile.L2 GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "TRANS") -GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1) -GenerateNamedObjects("generic/ger.c" "" "ger_k") +GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) +GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) # Makefile.L3 
@@ -77,7 +78,7 @@ set(DGEMMITCOPYOBJ gemm_itcopy) set(DGEMMONCOPYOBJ gemm_oncopy) set(DGEMMOTCOPYOBJ gemm_otcopy) -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" "" 3) if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "" "${DGEMMINCOPYOBJ}") From cebc07cebde00616b9b3facdb4cbe21f3aba3847 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 17 Feb 2015 22:09:41 -0600 Subject: [PATCH 080/257] ParseMakefileVars now recursively parses included makefiles. --- cmake/utils.cmake | 15 +++++++++------ kernel/CMakeLists.txt | 23 +++-------------------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9635b210c..fbb546dbe 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -13,11 +13,9 @@ function(ParseGetArchVars GETARCH_IN) endfunction () # Reads a Makefile into CMake vars. -# TODO: read nested Makefiles (I think 1 level should do) # TODO: respect IFDEF/IFNDEF? # TODO: regex replace makefile vars, e.g. 
$(TSUFFIX) is set to the target arch in the var CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -# TODO: bail when makefile is missing, like -include -function(ParseMakefileVars MAKEFILE_IN) +macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) @@ -25,13 +23,18 @@ function(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") set(var_name ${CMAKE_MATCH_1}) set(var_value ${CMAKE_MATCH_2}) - set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) + set(${var_name} ${var_value}) message(STATUS "found var ${var_name} = ${var_value}") else () - message(STATUS "couldn't parse ${makefile_line} into a var") + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + else () + message(STATUS "couldn't parse ${makefile_line} into a var") + endif () endif () endforeach () -endfunction () +endmacro () # Returns all combinations of the input list, as a list with colon-separated combinations # E.g. 
input of A B C returns A B C A:B A:C B:C diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 479b1838f..12c27fd50 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${CMAKE_SOURCE_DIR}) include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") -# Makeflie +# Makefile if (DEFINED TARGET_CORE) #override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) @@ -31,11 +31,7 @@ GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" false "" "" true) # Makefile.L1 - -# TODO: need to read ${KERNELDIR}/KERNEL into CMake vars - foreach (float_type ${FLOAT_TYPES}) - # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) @@ -58,27 +54,14 @@ foreach (float_type ${FLOAT_TYPES}) endforeach () # Makefile.L2 - GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "TRANS") GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) # Makefile.L3 - -# TODO: these are from KERNEL.PENRYN - they should be read in from the appropriate ${KERNELDIR}/KERNEL file -set(DGEMM_BETA ../generic/gemm_beta.c) -set(DGEMMKERNEL gemm_kernel_2x4_penryn.S) -set(DGEMMINCOPY gemm_ncopy_2.S) -set(DGEMMITCOPY gemm_tcopy_2.S) -set(DGEMMONCOPY ../generic/gemm_ncopy_4.c) -set(DGEMMOTCOPY ../generic/gemm_tcopy_4.c) -set(DGEMMINCOPYOBJ gemm_incopy) -set(DGEMMITCOPYOBJ gemm_itcopy) -set(DGEMMONCOPYOBJ gemm_oncopy) -set(DGEMMOTCOPYOBJ gemm_otcopy) - -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" "" 3) +message(STATUS "dgemm: ${DGEMMKERNEL}") +GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" 
false "" "" false 3) if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "" "${DGEMMINCOPYOBJ}") From 14fd3d35de9b077d771782e35a1de89d1cb9a615 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 18 Feb 2015 10:25:01 -0600 Subject: [PATCH 081/257] Added checks for missing defines in kernel. --- kernel/CMakeLists.txt | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 12c27fd50..9f07157e3 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -36,12 +36,20 @@ foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + if (DEFINED ${float_char}MAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) + endif () + if (DEFINED ${float_char}MINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + endif () GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + if (DEFINED I${float_char}MAXKERNEL) + 
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) + endif () + if (DEFINED I${float_char}MINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) @@ -60,8 +68,10 @@ GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) # Makefile.L3 -message(STATUS "dgemm: ${DGEMMKERNEL}") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false 3) +foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) +endforeach () if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "" "${DGEMMINCOPYOBJ}") From 43725b82c5fa459ecc0ec98d21cee4c751cd33fd Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 18 Feb 2015 12:23:17 -0600 Subject: [PATCH 082/257] ParseMakefileVars now replaces Makefile vars with CMake vars. --- cmake/utils.cmake | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index fbb546dbe..c77b762e6 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -14,7 +14,6 @@ endfunction () # Reads a Makefile into CMake vars. # TODO: respect IFDEF/IFNDEF? -# TODO: regex replace makefile vars, e.g. 
$(TSUFFIX) is set to the target arch in the var CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") file(STRINGS ${MAKEFILE_IN} makefile_contents) @@ -23,14 +22,19 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") set(var_name ${CMAKE_MATCH_1}) set(var_value ${CMAKE_MATCH_2}) + # check for Makefile variables in the string, e.g. $(TSUFFIX) + string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) + foreach (make_var ${make_var_matches}) + # strip out Makefile $() markup + string(REGEX REPLACE "\\$\\(([0-9_a-zA-Z]+)\\)" "\\1" make_var ${make_var}) + # now replace the instance of the Makefile variable with the value of the CMake variable (note the double quote) + string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) + endforeach () set(${var_name} ${var_value}) - message(STATUS "found var ${var_name} = ${var_value}") else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) - else () - message(STATUS "couldn't parse ${makefile_line} into a var") endif () endif () endforeach () @@ -106,8 +110,10 @@ function(GenerateNamedObjects sources_in) set(defines_in ${ARGV1}) endif () - if (DEFINED ARGV2) + if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) + # strip off extension for kernel files that pass in the object name. + get_filename_component(name_in ${name_in} NAME_WE) endif () if (DEFINED ARGV3) From 94922980486c297c919cfe44fc96a52a3557c1f6 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 18 Feb 2015 13:01:05 -0600 Subject: [PATCH 083/257] Added other float types to Makefile.L3. 
--- kernel/CMakeLists.txt | 184 +++++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 84 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9f07157e3..be86094f1 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -71,96 +71,112 @@ GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) -endforeach () -if (DGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "" "${DGEMMINCOPYOBJ}") -endif () + if (${float_char}GEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "" "${${float_char}GEMMINCOPYOBJ}" false "" "" false ${float_type}) + endif () -if (DGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "" "${DGEMMITCOPYOBJ}") -endif () + if (${float_char}GEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "" "${${float_char}GEMMITCOPYOBJ}" false "" "" false ${float_type}) + endif () -if (DGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "" "${DGEMMONCOPYOBJ}") -endif () + if (${float_char}GEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "" "${${float_char}GEMMONCOPYOBJ}" false "" "" false ${float_type}) + endif () -if (DGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "" "${DGEMMOTCOPYOBJ}") -endif () + if (${float_char}GEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "" "${${float_char}GEMMOTCOPYOBJ}" false "" "" false ${float_type}) + endif () -GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}") -GenerateCombinationObjects("${KERNELDIR}/${DGEMMKERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" 
"LT;TRSMKERNEL" "trsm_kernel_LT") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN") -GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "RT;TRSMKERNEL" "trsm_kernel_RT") - -# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. -# Could simplify it a bit by pairing up by -UUNIT/-DUNIT. -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "" "trmm_iunncopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_ounucopy") -GenerateNamedObjects("generic/trmm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER" "trmm_ounncopy") - -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy") -GenerateNamedObjects("generic/trmm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_olnncopy") - -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "" "trmm_iutncopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_outucopy") -GenerateNamedObjects("generic/trmm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER" "trmm_outncopy") - -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy") -GenerateNamedObjects("generic/trmm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_oltncopy") - 
-GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "" "trsm_iunncopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_ounucopy") -GenerateNamedObjects("generic/trsm_uncopy_${DGEMM_UNROLL_M}.c" "OUTER" "trsm_ounncopy") - -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy") -GenerateNamedObjects("generic/trsm_lncopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_olnncopy") - -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "" "trsm_iutncopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_outucopy") -GenerateNamedObjects("generic/trsm_utcopy_${DGEMM_UNROLL_M}.c" "OUTER" "trsm_outncopy") - -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy") -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy") -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy") -GenerateNamedObjects("generic/trsm_ltcopy_${DGEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_oltncopy") - -GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "OUTER" "symm_outcopy") -GenerateNamedObjects("generic/symm_ucopy_${DGEMM_UNROLL_N}.c" "" "symm_iutcopy") - -GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy") -GenerateNamedObjects("generic/symm_lcopy_${DGEMM_UNROLL_N}.c" "LOWER" "symm_iltcopy") - -if (NOT DEFINED DOMATCOPY_CN) - set(DOMATCOPY_CN ../arm/omatcopy_cn.c) -endif () -if (NOT DEFINED DOMATCOPY_RN) - set(DOMATCOPY_RN ../arm/omatcopy_rn.c) -endif () -if (NOT 
DEFINED DOMATCOPY_CT) - set(DOMATCOPY_CT ../arm/omatcopy_ct.c) -endif () -if (NOT DEFINED DOMATCOPY_RT) - set(DOMATCOPY_RT ../arm/omatcopy_rt.c) -endif () + GenerateCombinationObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + + # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. + # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
+ GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" 
"OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + + if (NOT DEFINED ${float_char}OMATCOPY_CN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CN ../arm/zomatcopy_cn.c) + else () + set(${float_char}OMATCOPY_CN ../arm/omatcopy_cn.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RN ../arm/zomatcopy_rn.c) + else () + set(${float_char}OMATCOPY_RN ../arm/omatcopy_rn.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_CT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CT ../arm/zomatcopy_ct.c) + else () + set(${float_char}OMATCOPY_CT ../arm/omatcopy_ct.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RT ../arm/zomatcopy_rt.c) + else () + set(${float_char}OMATCOPY_RT ../arm/omatcopy_rt.c) + endif () + endif () -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CN}" "" 
"domatcopy_k_cn") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RN}" "ROWM" "domatcopy_k_rn") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_CT}" "" "domatcopy_k_ct") -GenerateNamedObjects("${KERNELDIR}/${DOMATCOPY_RT}" "ROWM" "domatcopy_k_rt") + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CN}" "" "domatcopy_k_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RN}" "ROWM" "domatcopy_k_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CT}" "" "domatcopy_k_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RT}" "ROWM" "domatcopy_k_rt" false "" "" false ${float_type}) + +endforeach () # Makefile.LA #DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) From f3f2b3d76836e5c2758be7053c7e4abbc3fac311 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 19 Feb 2015 12:26:11 -0600 Subject: [PATCH 084/257] Added complex and single netlib-lapack fortran sources to lapack.cmake. --- cmake/lapack.cmake | 409 +++++++++++++++++++++++++++++++++--------- kernel/CMakeLists.txt | 1 + lapack/CMakeLists.txt | 6 +- 3 files changed, 332 insertions(+), 84 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 14581a9b2..e8d19f10d 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,96 +1,343 @@ # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
set(ALLAUX - ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f ../INSTALL/ilaver.f ../INSTALL/slamch.f ) +set(SCLAUX + sbdsdc.f + sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f + slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f + slagts.f slamrg.f slanst.f + slapy2.f slapy3.f slarnv.f + slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f + slarrk.f slarrr.f slaneg.f + slartg.f slaruv.f slas2.f slascl.f + slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f + slasd7.f slasd8.f slasda.f slasdq.f slasdt.f + slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f + slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f + ssteqr.f ssterf.f slaisnan.f sisnan.f + slartgp.f slartgs.f + ../INSTALL/second_${TIMER}.f +) + set(DZLAUX - dbdsdc.f - dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f - dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f - dlagts.f dlamrg.f dlanst.f - dlapy2.f dlapy3.f dlarnv.f - dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f - dlarrk.f dlarrr.f dlaneg.f - dlartg.f dlaruv.f dlas2.f dlascl.f - dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f - dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f - dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f - dsteqr.f dsterf.f dlaisnan.f disnan.f - dlartgp.f dlartgs.f - ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f + dbdsdc.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + 
dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f + dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f + dlartgp.f dlartgs.f + ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f +) + +set(SLASRC + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f + sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f + sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f + sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f + sgetc2.f sgetri.f + sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f + sggglm.f sgghrd.f sgglse.f sggqrf.f + sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f + sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f + shsein.f shseqr.f slabrd.f slacon.f slacn2.f + slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f + slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f + slansy.f slantb.f slantp.f slantr.f slanv2.f + slapll.f slapmt.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f + slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f + slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f + slarrv.f slartv.f + slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f + sopgtr.f sopmtr.f sorg2l.f sorg2r.f + sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f + sorgrq.f sorgtr.f sorm2l.f sorm2r.f + sormbr.f sormhr.f 
sorml2.f sormlq.f sormql.f sormqr.f sormr2.f + sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f + spbstf.f spbsv.f spbsvx.f + spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f + sposvx.f spstrf.f spstf2.f + sppcon.f sppequ.f + spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f + spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f + ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f + ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f + sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f + sstevx.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f + ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f + ssyswapr.f ssytrs.f ssytrs2.f ssyconv.f + ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f + ssytri_rook.f ssycon_rook.f ssysv_rook.f + stbcon.f + stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f + stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f + stptrs.f + strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f + strtrs.f stzrqf.f stzrzf.f sstemr.f + slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f + stfttr.f stpttf.f stpttr.f strttf.f strttp.f + sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f + sgeequb.f ssyequb.f spoequb.f sgbequb.f + sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f + sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f + sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f + stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f +) + +set(DSLASRC spotrs.f) + +set(CLASRC + cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f + cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f + cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f + cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f 
cgeqrfp.f cgerfs.f + cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f + cgesvx.f cgetc2.f cgetri.f + cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f + cgghrd.f cgglse.f cggqrf.f cggrqf.f + cggsvd.f cggsvp.f + cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f + chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f + checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f + chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f + chetf2.f chetrd.f + chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f + chetrs.f chetrs2.f + chetf2_rook.f chetrf_rook.f chetri_rook.f chetrs_rook.f checon_rook.f chesv_rook.f + chgeqz.f chpcon.f chpev.f chpevd.f + chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f + chpsvx.f + chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f + clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f + claed0.f claed7.f claed8.f + claein.f claesy.f claev2.f clags2.f clagtm.f + clahef.f clahef_rook.f clahqr.f + clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + clanhb.f clanhe.f + clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f + clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f + claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f + clarf.f clarfb.f clarfg.f clarft.f clarfgp.f + clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f + clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f + clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f + cposv.f cposvx.f cpstrf.f cpstf2.f + cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f + cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f + crot.f cspcon.f csprfs.f 
cspsv.f + cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f + cstegr.f cstein.f csteqr.f + csycon.f + csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f csytri2.f csytri2x.f + csyswapr.f csytrs.f csytrs2.f csyconv.f + csytf2_rook.f csytrf_rook.f csytrs_rook.f + csytri_rook.f csycon_rook.f csysv_rook.f + ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f + ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f + ctprfs.f ctptri.f + ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f + ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f + cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f + cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f + cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f + chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f + ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f + cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f + cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f + cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f + cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f + ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f ) +set(ZCLASRC cpotrs.f) + set(DLASRC - dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f - dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f - dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f - dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f - dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f - dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f - dgetc2.f dgetri.f - dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f - dggglm.f dgghrd.f dgglse.f dggqrf.f - dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f - dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f - dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f - dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f - dlahrd.f dlahr2.f dlaic1.f 
dlaln2.f dlals0.f dlalsa.f dlalsd.f - dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f - dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f - dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f - dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f - dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f - dlargv.f dlarrv.f dlartv.f - dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f - dopgtr.f dopmtr.f dorg2l.f dorg2r.f - dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f - dorgrq.f dorgtr.f dorm2l.f dorm2r.f - dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f - dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f - dpbstf.f dpbsv.f dpbsvx.f - dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f - dposvx.f dpotrs.f dpstrf.f dpstf2.f - dppcon.f dppequ.f - dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f - dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f - dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f - dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f - dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f - dstevx.f - dsycon.f dsyev.f dsyevd.f dsyevr.f - dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f - dsysv.f dsysvx.f - dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytri2.f dsytri2x.f - dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f - dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f - dsytri_rook.f dsycon_rook.f dsysv_rook.f - dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f - dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f - dtptrs.f - dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f - dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f - dlansf.f dpftrf.f dpftri.f dpftrs.f 
dsfrk.f dtfsm.f dtftri.f dtfttp.f - dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f - dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f - dgeequb.f dsyequb.f dpoequb.f dgbequb.f - dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f - dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f - dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f - dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f + dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f + dgetc2.f dgetri.f + dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f + dggglm.f dgghrd.f dgglse.f dggqrf.f + dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f + dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f 
dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f + dstevx.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytri2.f dsytri2x.f + dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f +) + +set(ZLASRC + zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f + zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f + zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f + zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f + zgetri.f + zggbak.f zggbal.f zgges.f zggesx.f 
zggev.f zggevx.f zggglm.f + zgghrd.f zgglse.f zggqrf.f zggrqf.f + zggsvd.f zggsvp.f + zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f + zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f + zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f + zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f + zhetf2.f zhetrd.f + zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f + zhetrs.f zhetrs2.f + zhetf2_rook.f zhetrf_rook.f zhetri_rook.f zhetrs_rook.f zhecon_rook.f zhesv_rook.f + zhgeqz.f zhpcon.f zhpev.f zhpevd.f + zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f + zhpsvx.f + zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f + zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f + zlaed0.f zlaed7.f zlaed8.f + zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f + zlahef.f zlahef_rook.f zlahqr.f + zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + zlangt.f zlanhb.f + zlanhe.f + zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f + zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f + zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f + zlarcm.f zlarf.f zlarfb.f + zlarfg.f zlarft.f zlarfgp.f + zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f + zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f + zlassq.f zlasyf.f zlasyf_rook.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f + zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f + zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f + zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f + zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f + zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f + zrot.f zspcon.f zsprfs.f zspsv.f + zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f + zstegr.f zstein.f zsteqr.f + zsycon.f + zsyrfs.f zsysv.f 
zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f + zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f + zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f + zsytri_rook.f zsycon_rook.f zsysv_rook.f + ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f + ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f + ztprfs.f ztptri.f + ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f + ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f + zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f + zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f + zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f + zunmtr.f zupgtr.f + zupmtr.f izmax1.f dzsum1.f zstemr.f + zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f + zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f + ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f + zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f + zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f + zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f + zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f + ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f ) -set(LA_REL_SRC ${ALLAUX} ${DZLAUX} ${DLASRC}) +set(LA_REL_SRC ${ALLAUX}) +if (BUILD_SINGLE) + list(APPEND LA_REL_SRC ${SLASRC} ${DSLASRC} ${SCLAUX}) +endif () + +if (BUILD_DOUBLE) + list(APPEND LA_REL_SRC ${DLASRC} ${DSLASRC} ${DZLAUX}) +endif () + +if (BUILD_COMPLEX) + list(APPEND LA_REL_SRC ${CLASRC} ${ZCLASRC} ${SCLAUX}) +endif () + +if (BUILD_COMPLEX16) + list(APPEND LA_REL_SRC ${ZLASRC} ${ZCLASRC} ${DZLAUX}) +endif () # add lapack-netlib folder to the sources set(LA_SOURCES "") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index be86094f1..f2b66ba1d 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -88,6 +88,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "" "${${float_char}GEMMOTCOPYOBJ}" false "" "" false ${float_type}) endif () + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "" false "" "" false ${float_type}) GenerateCombinationObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 26922f50e..7e17de7de 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -58,11 +58,11 @@ set(ZLAPACK_SOURCES ) GenerateNamedObjects("${LAPACK_SOURCES}") -GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" "" 3) +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) # TODO: laswp needs arch specific code -GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus") -GenerateNamedObjects("laswp/generic/laswp_k.c" "MINUS" "laswp_minus") +GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus" false "" "" false 3) +GenerateNamedObjects("laswp/generic/laswp_k.c" "MINUS" "laswp_minus" false "" "" false 3) if (SMP) From e27c372e53fcaceed66193440fbbd450a8d6e251 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 19 Feb 2015 13:53:29 -0600 Subject: [PATCH 085/257] Fixed reuse of float_char from parent loop. Fixed in/it/on/otcopy names. 
--- cmake/utils.cmake | 1 + kernel/CMakeLists.txt | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index c77b762e6..b8f166fb0 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -165,6 +165,7 @@ function(GenerateNamedObjects sources_in) list(REMOVE_ITEM float_list "ZCOMPLEX") endif () + set(float_char "") set(OBJ_LIST_OUT "") foreach (float_type ${float_list}) foreach (source_file ${sources_in}) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index f2b66ba1d..50dbabb91 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -73,19 +73,19 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) if (${float_char}GEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "" "${${float_char}GEMMINCOPYOBJ}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "" "${${float_char}GEMMITCOPYOBJ}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "${float_type}" "${${float_char}GEMMITCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "" "${${float_char}GEMMONCOPYOBJ}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "${float_type}" "${${float_char}GEMMONCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "" "${${float_char}GEMMOTCOPYOBJ}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "${float_type}" 
"${${float_char}GEMMOTCOPYOBJ}" false "" "" true ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "" false "" "" false ${float_type}) From 714638c187deff177fb6045634f7236218c610b7 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 19 Feb 2015 16:11:51 -0600 Subject: [PATCH 086/257] Added some TRMM objects for complex types. --- cmake/utils.cmake | 1 - kernel/CMakeLists.txt | 43 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index b8f166fb0..b706dfc03 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -202,7 +202,6 @@ function(GenerateNamedObjects sources_in) endif () list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") - list(APPEND obj_defines ${defines_in}) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 50dbabb91..d91b288fd 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -68,6 +68,17 @@ GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) # Makefile.L3 +set(USE_GEMM3M false) +set(USE_TRMM false) + +if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") + set(USE_GEMM3M true) +endif () + +if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC") + set(USE_TRMM true) +endif () + foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) @@ -88,12 +99,32 @@ foreach (float_type ${FLOAT_TYPES}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "${float_type}" "${${float_char}GEMMOTCOPYOBJ}" false "" "" true ${float_type}) endif () - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "" false "" "" false ${float_type}) - GenerateCombinationObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "gemm_beta" false "" "" false ${float_type}) + + if (USE_TRMM) + set(TRMM_KERNEL "${${float_char}TRMMKERNEL}") + else () + set(TRMM_KERNEL "${${float_char}GEMMKERNEL}") + endif () + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + # screw it, just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. 
CN NC NN) that I don't really want to work into GenerateCombinationObjects + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA;CONJ;CN" "trmm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "NN" "trmm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) + else () + GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
From e5897ecb9bb152c5ea25491a14f838ae1d90b0a1 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 19 Feb 2015 16:19:56 -0600 Subject: [PATCH 087/257] Added zherk_kernel.c objects to driver/level3. --- driver/level3/CMakeLists.txt | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 7259a87e7..53c72538b 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -3,19 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR}) set(USE_GEMM3M 0) if (DEFINED ARCH) - if (${ARCH} STREQUAL "x86") - set(USE_GEMM3M 1) - endif () - - if (${ARCH} STREQUAL "x86_64") - set(USE_GEMM3M 1) - endif () - - if (${ARCH} STREQUAL "ia64") - set(USE_GEMM3M 1) - endif () - - if (${ARCH} STREQUAL "MIPS") + if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") set(USE_GEMM3M 1) endif () endif () @@ -48,6 +36,14 @@ if (SMP) endif () endif () +foreach (float_type ${FLOAT_TYPES}) + set(VERBOSE_GEN true) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateCombinationObjects("zherk_kernel" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) + endif () + set(VERBOSE_GEN false) +endforeach () + #HPLOBJS = # dgemm_nn.c dgemm_nt.c dgemm_tn.c dgemm_tt.c # dtrsm_LNUU.c dtrsm_LNUN.c dtrsm_LNLU.c dtrsm_LNLN.c From 8a143516e33927fd0ef047452313e2d9026c2f89 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Fri, 20 Feb 2015 17:03:33 -0600 Subject: [PATCH 088/257] Added alternate_name to a couple of the name mangling schemes. Added zherk_k sources to driver/level3. 
--- cmake/utils.cmake | 17 ++++++++++++++--- driver/level3/CMakeLists.txt | 9 ++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index b706dfc03..11f2babd5 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -241,14 +241,17 @@ endfunction () # @param complex_filename_scheme see GenerateNamedObjects function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) + set(alternate_name_in "") if (DEFINED ARGV5) - set(alternate_name ${ARGV5}) + set(alternate_name_in ${ARGV5}) endif () + set(no_float_type false) if (DEFINED ARGV6) set(no_float_type ${ARGV6}) endif () + set(complex_filename_scheme "") if (DEFINED ARGV7) set(complex_filename_scheme ${ARGV7}) endif () @@ -268,6 +271,8 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de foreach (source_file ${sources_in}) + set(alternate_name ${alternate_name_in}) + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with string(REPLACE ":" ";" define_combo ${define_combo}) @@ -287,8 +292,12 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de if (replace_scheme EQUAL 2) set(append_code "_${define_code}") elseif (replace_scheme EQUAL 3) + if ("${alternate_name}" STREQUAL "") + string(REGEX MATCH "[a-zA-Z]\\." last_letter ${source_file}) + else () + string(REGEX MATCH "[a-zA-Z]$" last_letter ${alternate_name}) + endif () # first extract the last letter - string(REGEX MATCH "[a-zA-Z]\\." 
last_letter ${source_file}) string(SUBSTRING ${last_letter} 0 1 last_letter) # remove period from match # break the code up into the first letter and the remaining (should only be 2 anyway) string(SUBSTRING ${define_code} 0 1 define_code_first) @@ -296,7 +305,9 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de set(replace_code "${define_code_first}${last_letter}${define_code_second}") elseif (replace_scheme EQUAL 4) # insert code before the last underscore and pass that in as the alternate_name - get_filename_component(alternate_name ${source_file} NAME_WE) + if ("${alternate_name}" STREQUAL "") + get_filename_component(alternate_name ${source_file} NAME_WE) + endif () set(extra_underscore "") # check if filename has two underscores, insert another if not (e.g. getrs_parallel needs to become getrs_U_parallel not getrsU_parallel) string(REGEX MATCH "_[a-zA-Z]+_" underscores ${alternate_name}) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 53c72538b..d9c66db59 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -37,11 +37,14 @@ if (SMP) endif () foreach (float_type ${FLOAT_TYPES}) - set(VERBOSE_GEN true) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") - GenerateCombinationObjects("zherk_kernel" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) + GenerateCombinationObjects("zherk_kernel.c" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) + # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 
"herk_thread_C" false ${float_type}) endif () - set(VERBOSE_GEN false) endforeach () #HPLOBJS = From 371071d461e37f2a1c62a6cec8ac40ad2190b75e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sat, 21 Feb 2015 10:59:02 -0600 Subject: [PATCH 089/257] Added CONJ defines for trmm/trsm. --- driver/level3/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index d9c66db59..85bde071d 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -44,6 +44,15 @@ foreach (float_type ${FLOAT_TYPES}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) + # Need to set CONJ for trmm and trsm + GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) + GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trmm_LC" false ${float_type}) + GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_RR" false ${float_type}) + GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trmm_RC" false ${float_type}) + GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_LR" false ${float_type}) + GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_LC" false ${float_type}) + GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) + GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_RC" false ${float_type}) endif () endforeach () From fb5d5bb9717e5dde95857cdc03e0e7f0dd86f246 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sat, 21 Feb 2015 12:39:03 -0600 Subject: [PATCH 090/257] Added defines for complex trmv. 
--- driver/level2/CMakeLists.txt | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index a1685dbd6..8b37917a6 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -17,13 +17,11 @@ set(NU_SOURCES tbsv_U.c tpmv_U.c tpsv_U.c - trmv_U.c trsv_U.c tbmv_L.c tbsv_L.c tpmv_L.c tpsv_L.c - trmv_L.c trsv_L.c ) @@ -38,6 +36,22 @@ GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "" 3 "" "" 3) GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) +# special defines for complex trmv +foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=1" 0 "trmv_NU" false ${float_type}) + GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=2" 0 "trmv_TL" false ${float_type}) + GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=3" 0 "trmv_RU" false ${float_type}) + GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=4" 0 "trmv_CL" false ${float_type}) + GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=1" 0 "trmv_NL" false ${float_type}) + GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=2" 0 "trmv_TU" false ${float_type}) + GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=3" 0 "trmv_RL" false ${float_type}) + GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=4" 0 "trmv_CU" false ${float_type}) + else () + GenerateCombinationObjects("trmv_U.c;trmv_L.c" "TRANSA;UNIT" "N;N" "" 3 "" false ${float_type}) + endif () +endforeach () + if (SMP) # gbmv uses a lowercase n and t. N.B. this uses TRANSA where gbmv.c uses TRANS. Intentional? 
@@ -69,6 +83,13 @@ if (SMP) GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2) + foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "trmv_thread_R" false ${float_type}) + GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "trmv_thread_C" false ${float_type}) + endif () + endforeach () + endif () set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS From a6116e585974950cc5a4ccbadf519fb647767455 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Sun, 22 Feb 2015 17:49:28 -0600 Subject: [PATCH 091/257] Added some more complex-only objects. --- cmake/utils.cmake | 2 +- interface/CMakeLists.txt | 18 ++++++++++++++++++ kernel/CMakeLists.txt | 11 ++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 11f2babd5..edf25cdb2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -321,7 +321,7 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de endif () endif () - GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" 0 "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") + GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" false "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 633b8a6fe..33464c3cd 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -84,6 +84,24 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) endforeach () +# complex-specific sources +foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + 
GenerateNamedObjects("zger.c" "" "geru" false "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "CONJ" "gerc" false "" "" false ${float_type}) + endif () + if (${float_type} STREQUAL "COMPLEX") + GenerateNamedObjects("zscal.c" "SSCAL" "sscal" false "" "" false "COMPLEX") + GenerateNamedObjects("nrm2.c" "" "scnrm2" false "" "" true "COMPLEX") + endif () + + if (${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zscal.c" "SSCAL" "dscal" false "" "" false "ZCOMPLEX") + GenerateNamedObjects("nrm2.c" "" "dznrm2" false "" "" true "ZCOMPLEX") + endif () +endforeach () + + if (NOT DEFINED NO_LAPACK) set(LAPACK_SOURCES lapack/gesv.c diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d91b288fd..84ec428ba 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -16,8 +16,6 @@ else () endif () SetDefaultL1() -#-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) -#include $(KERNELDIR)/KERNEL ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") ParseMakefileVars("${KERNELDIR}/KERNEL") @@ -53,12 +51,19 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dotu_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "CONJ" "dotc_k" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) + endif () endforeach () # Makefile.L2 From b2284647a3dc3192d08763b835f69bd6df61ea04 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 23 Feb 2015 07:51:05 -0600 Subject: [PATCH 092/257] More complex objects. --- interface/CMakeLists.txt | 1 - kernel/CMakeLists.txt | 24 +++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 33464c3cd..1ca554307 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -94,7 +94,6 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("zscal.c" "SSCAL" "sscal" false "" "" false "COMPLEX") GenerateNamedObjects("nrm2.c" "" "scnrm2" false "" "" true "COMPLEX") endif () - if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" false "" "" false "ZCOMPLEX") GenerateNamedObjects("nrm2.c" "" "dznrm2" false "" "" true "ZCOMPLEX") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 84ec428ba..656090cf8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -71,6 +71,15 @@ GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "TRANS") GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) +foreach (float_type 
${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "XCONJ" "gerv_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ;XCONJ" "gerd_k" false "" "" false ${float_type}) + endif () +endforeach () # Makefile.L3 set(USE_GEMM3M false) @@ -113,7 +122,14 @@ foreach (float_type ${FLOAT_TYPES}) endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") - # screw it, just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. CN NC NN) that I don't really want to work into GenerateCombinationObjects + + # just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. 
CN NC NN) that I don't really want to work into GenerateCombinationObjects + + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) @@ -122,6 +138,12 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "UPPER;LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) + else () GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" 
false ${float_type}) endif () From 1b7f427401a93626097304b5d84f78163e478c5e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Mon, 23 Feb 2015 10:24:31 -0600 Subject: [PATCH 093/257] Added conj gemv objects for complex build. --- cmake/kernel.cmake | 268 +++++++++++++++++++++-------------- driver/level2/CMakeLists.txt | 39 +++-- kernel/CMakeLists.txt | 15 +- 3 files changed, 202 insertions(+), 120 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 211da229d..3a4d13837 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,110 +1,158 @@ -# helper functions for the kernel CMakeLists.txt - - -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. -macro(SetDefaultL1) - set(SAMAXKERNEL amax.S) - set(DAMAXKERNEL amax.S) - set(QAMAXKERNEL amax.S) - set(CAMAXKERNEL zamax.S) - set(ZAMAXKERNEL zamax.S) - set(XAMAXKERNEL zamax.S) - set(SAMINKERNEL amin.S) - set(DAMINKERNEL amin.S) - set(QAMINKERNEL amin.S) - set(CAMINKERNEL zamin.S) - set(ZAMINKERNEL zamin.S) - set(XAMINKERNEL zamin.S) - set(SMAXKERNEL max.S) - set(DMAXKERNEL max.S) - set(QMAXKERNEL max.S) - set(SMINKERNEL min.S) - set(DMINKERNEL min.S) - set(QMINKERNEL min.S) - set(ISAMAXKERNEL iamax.S) - set(IDAMAXKERNEL iamax.S) - set(IQAMAXKERNEL iamax.S) - set(ICAMAXKERNEL izamax.S) - set(IZAMAXKERNEL izamax.S) - set(IXAMAXKERNEL izamax.S) - set(ISAMINKERNEL iamin.S) - set(IDAMINKERNEL iamin.S) - set(IQAMINKERNEL iamin.S) - set(ICAMINKERNEL izamin.S) - set(IZAMINKERNEL izamin.S) - set(IXAMINKERNEL izamin.S) - set(ISMAXKERNEL iamax.S) - set(IDMAXKERNEL iamax.S) - set(IQMAXKERNEL iamax.S) - set(ISMINKERNEL iamin.S) - set(IDMINKERNEL iamin.S) - set(IQMINKERNEL iamin.S) - set(SASUMKERNEL asum.S) - set(DASUMKERNEL asum.S) - set(CASUMKERNEL zasum.S) - set(ZASUMKERNEL zasum.S) - set(QASUMKERNEL asum.S) - set(XASUMKERNEL zasum.S) - set(SAXPYKERNEL axpy.S) - set(DAXPYKERNEL axpy.S) - set(CAXPYKERNEL zaxpy.S) - set(ZAXPYKERNEL zaxpy.S) - 
set(QAXPYKERNEL axpy.S) - set(XAXPYKERNEL zaxpy.S) - set(SCOPYKERNEL copy.S) - set(DCOPYKERNEL copy.S) - set(CCOPYKERNEL zcopy.S) - set(ZCOPYKERNEL zcopy.S) - set(QCOPYKERNEL copy.S) - set(XCOPYKERNEL zcopy.S) - set(SDOTKERNEL dot.S) - set(DDOTKERNEL dot.S) - set(CDOTKERNEL zdot.S) - set(ZDOTKERNEL zdot.S) - set(QDOTKERNEL dot.S) - set(XDOTKERNEL zdot.S) - set(SNRM2KERNEL nrm2.S) - set(DNRM2KERNEL nrm2.S) - set(QNRM2KERNEL nrm2.S) - set(CNRM2KERNEL znrm2.S) - set(ZNRM2KERNEL znrm2.S) - set(XNRM2KERNEL znrm2.S) - set(SROTKERNEL rot.S) - set(DROTKERNEL rot.S) - set(QROTKERNEL rot.S) - set(CROTKERNEL zrot.S) - set(ZROTKERNEL zrot.S) - set(XROTKERNEL zrot.S) - set(SSCALKERNEL scal.S) - set(DSCALKERNEL scal.S) - set(CSCALKERNEL zscal.S) - set(ZSCALKERNEL zscal.S) - set(QSCALKERNEL scal.S) - set(XSCALKERNEL zscal.S) - set(SSWAPKERNEL swap.S) - set(DSWAPKERNEL swap.S) - set(CSWAPKERNEL zswap.S) - set(ZSWAPKERNEL zswap.S) - set(QSWAPKERNEL swap.S) - set(XSWAPKERNEL zswap.S) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL cabs.S) - set(DCABS_KERNEL cabs.S) - set(QCABS_KERNEL cabs.S) - set(LSAME_KERNEL lsame.S) - set(SAXPBYKERNEL ../arm/axpby.c) - set(DAXPBYKERNEL ../arm/axpby.c) - set(CAXPBYKERNEL ../arm/zaxpby.c) - set(ZAXPBYKERNEL ../arm/zaxpby.c) -endmacro () +# helper functions for the kernel CMakeLists.txt + + +# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. 
+macro(SetDefaultL1) + set(SAMAXKERNEL amax.S) + set(DAMAXKERNEL amax.S) + set(QAMAXKERNEL amax.S) + set(CAMAXKERNEL zamax.S) + set(ZAMAXKERNEL zamax.S) + set(XAMAXKERNEL zamax.S) + set(SAMINKERNEL amin.S) + set(DAMINKERNEL amin.S) + set(QAMINKERNEL amin.S) + set(CAMINKERNEL zamin.S) + set(ZAMINKERNEL zamin.S) + set(XAMINKERNEL zamin.S) + set(SMAXKERNEL max.S) + set(DMAXKERNEL max.S) + set(QMAXKERNEL max.S) + set(SMINKERNEL min.S) + set(DMINKERNEL min.S) + set(QMINKERNEL min.S) + set(ISAMAXKERNEL iamax.S) + set(IDAMAXKERNEL iamax.S) + set(IQAMAXKERNEL iamax.S) + set(ICAMAXKERNEL izamax.S) + set(IZAMAXKERNEL izamax.S) + set(IXAMAXKERNEL izamax.S) + set(ISAMINKERNEL iamin.S) + set(IDAMINKERNEL iamin.S) + set(IQAMINKERNEL iamin.S) + set(ICAMINKERNEL izamin.S) + set(IZAMINKERNEL izamin.S) + set(IXAMINKERNEL izamin.S) + set(ISMAXKERNEL iamax.S) + set(IDMAXKERNEL iamax.S) + set(IQMAXKERNEL iamax.S) + set(ISMINKERNEL iamin.S) + set(IDMINKERNEL iamin.S) + set(IQMINKERNEL iamin.S) + set(SASUMKERNEL asum.S) + set(DASUMKERNEL asum.S) + set(CASUMKERNEL zasum.S) + set(ZASUMKERNEL zasum.S) + set(QASUMKERNEL asum.S) + set(XASUMKERNEL zasum.S) + set(SAXPYKERNEL axpy.S) + set(DAXPYKERNEL axpy.S) + set(CAXPYKERNEL zaxpy.S) + set(ZAXPYKERNEL zaxpy.S) + set(QAXPYKERNEL axpy.S) + set(XAXPYKERNEL zaxpy.S) + set(SCOPYKERNEL copy.S) + set(DCOPYKERNEL copy.S) + set(CCOPYKERNEL zcopy.S) + set(ZCOPYKERNEL zcopy.S) + set(QCOPYKERNEL copy.S) + set(XCOPYKERNEL zcopy.S) + set(SDOTKERNEL dot.S) + set(DDOTKERNEL dot.S) + set(CDOTKERNEL zdot.S) + set(ZDOTKERNEL zdot.S) + set(QDOTKERNEL dot.S) + set(XDOTKERNEL zdot.S) + set(SNRM2KERNEL nrm2.S) + set(DNRM2KERNEL nrm2.S) + set(QNRM2KERNEL nrm2.S) + set(CNRM2KERNEL znrm2.S) + set(ZNRM2KERNEL znrm2.S) + set(XNRM2KERNEL znrm2.S) + set(SROTKERNEL rot.S) + set(DROTKERNEL rot.S) + set(QROTKERNEL rot.S) + set(CROTKERNEL zrot.S) + set(ZROTKERNEL zrot.S) + set(XROTKERNEL zrot.S) + set(SSCALKERNEL scal.S) + set(DSCALKERNEL scal.S) + set(CSCALKERNEL zscal.S) + 
set(ZSCALKERNEL zscal.S) + set(QSCALKERNEL scal.S) + set(XSCALKERNEL zscal.S) + set(SSWAPKERNEL swap.S) + set(DSWAPKERNEL swap.S) + set(CSWAPKERNEL zswap.S) + set(ZSWAPKERNEL zswap.S) + set(QSWAPKERNEL swap.S) + set(XSWAPKERNEL zswap.S) + set(SGEMVNKERNEL gemv_n.S) + set(SGEMVTKERNEL gemv_t.S) + set(DGEMVNKERNEL gemv_n.S) + set(DGEMVTKERNEL gemv_t.S) + set(CGEMVNKERNEL zgemv_n.S) + set(CGEMVTKERNEL zgemv_t.S) + set(ZGEMVNKERNEL zgemv_n.S) + set(ZGEMVTKERNEL zgemv_t.S) + set(QGEMVNKERNEL gemv_n.S) + set(QGEMVTKERNEL gemv_t.S) + set(XGEMVNKERNEL zgemv_n.S) + set(XGEMVTKERNEL zgemv_t.S) + set(SCABS_KERNEL cabs.S) + set(DCABS_KERNEL cabs.S) + set(QCABS_KERNEL cabs.S) + set(LSAME_KERNEL lsame.S) + set(SAXPBYKERNEL ../arm/axpby.c) + set(DAXPBYKERNEL ../arm/axpby.c) + set(CAXPBYKERNEL ../arm/zaxpby.c) + set(ZAXPBYKERNEL ../arm/zaxpby.c) +endmacro () + +macro(SetDefaultL2) + set(SGEMVNKERNEL gemv_n.S) + set(SGEMVTKERNEL gemv_t.S) + set(DGEMVNKERNEL gemv_n.S) + set(DGEMVTKERNEL gemv_t.S) + set(CGEMVNKERNEL zgemv_n.S) + set(CGEMVTKERNEL zgemv_t.S) + set(ZGEMVNKERNEL zgemv_n.S) + set(ZGEMVTKERNEL zgemv_t.S) + set(QGEMVNKERNEL gemv_n.S) + set(QGEMVTKERNEL gemv_t.S) + set(XGEMVNKERNEL zgemv_n.S) + set(XGEMVTKERNEL zgemv_t.S) + set(SGERKERNEL ../generic/ger.c) + set(DGERKERNEL ../generic/ger.c) + set(QGERKERNEL ../generic/ger.c) + set(CGERUKERNEL ../generic/zger.c) + set(CGERCKERNEL ../generic/zger.c) + set(ZGERUKERNEL ../generic/zger.c) + set(ZGERCKERNEL ../generic/zger.c) + set(XGERUKERNEL ../generic/zger.c) + set(XGERCKERNEL ../generic/zger.c) + set(SSYMV_U_KERNEL ../generic/symv_k.c) + set(SSYMV_L_KERNEL ../generic/symv_k.c) + set(DSYMV_U_KERNEL ../generic/symv_k.c) + set(DSYMV_L_KERNEL ../generic/symv_k.c) + set(QSYMV_U_KERNEL ../generic/symv_k.c) + set(QSYMV_L_KERNEL ../generic/symv_k.c) + set(CSYMV_U_KERNEL ../generic/zsymv_k.c) + set(CSYMV_L_KERNEL ../generic/zsymv_k.c) + set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + 
set(XSYMV_U_KERNEL ../generic/zsymv_k.c) + set(XSYMV_L_KERNEL ../generic/zsymv_k.c) + set(CHEMV_U_KERNEL ../generic/zhemv_k.c) + set(CHEMV_L_KERNEL ../generic/zhemv_k.c) + set(CHEMV_V_KERNEL ../generic/zhemv_k.c) + set(CHEMV_M_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + set(XHEMV_U_KERNEL ../generic/zhemv_k.c) + set(XHEMV_L_KERNEL ../generic/zhemv_k.c) + set(XHEMV_V_KERNEL ../generic/zhemv_k.c) + set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +endmacro () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 8b37917a6..54e0eb42f 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -54,14 +54,6 @@ endforeach () if (SMP) - # gbmv uses a lowercase n and t. N.B. this uses TRANSA where gbmv.c uses TRANS. Intentional? - GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n") - GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t") - - GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n") - GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t") - - GenerateNamedObjects("ger_thread.c") set(UL_SMP_SOURCES symv_thread.c @@ -84,10 +76,41 @@ if (SMP) GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2) foreach (float_type ${FLOAT_TYPES}) + + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false ${float_type}) + + GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "trmv_thread_R" false ${float_type}) 
GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "trmv_thread_C" false ${float_type}) + + GenerateNamedObjects("gemv_thread.c" "CONJ" "gemv_thread_r" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "CONJ;TRANSA" "gemv_thread_c" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ" "gemv_thread_o" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;TRANSA" "gemv_thread_u" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ" "gemv_thread_s" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ;TRANSA" "gemv_thread_d" false "" "" false ${float_type}) + + GenerateNamedObjects("gbmv_thread.c" "CONJ" "gbmv_thread_r" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "CONJ;TRANSA" "gbmv_thread_c" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ" "gbmv_thread_o" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;TRANSA" "gbmv_thread_u" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ" "gbmv_thread_s" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ;TRANSA" "gbmv_thread_d" false "" "" false ${float_type}) + + GenerateNamedObjects("ger_thread.c" "" "ger_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "CONJ" "ger_thread_C" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) + + else () + GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) endif () + endforeach () endif () diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 656090cf8..cacd0f38f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -67,8 +67,7 @@ 
foreach (float_type ${FLOAT_TYPES}) endforeach () # Makefile.L2 -GenerateNamedObjects("${KERNELDIR}/gemv_n.S" "DOUBLE") -GenerateNamedObjects("${KERNELDIR}/gemv_t.S" "TRANS") +SetDefaultL2() GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) @@ -78,6 +77,18 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "XCONJ" "gerv_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ;XCONJ" "gerd_k" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "CONJ;TRANSA" "${KERNELDIR}/${${float_char}GEMVUKERNEL}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type}) endif () 
endforeach () From 12d1fb2e4076bd6e8631cb0c2a5e941f038334e1 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 10:30:16 -0600 Subject: [PATCH 094/257] Fixed incorrect object name in kernel CMakeLists.txt --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index cacd0f38f..939eef6c7 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -81,7 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "CONJ;TRANSA" "${KERNELDIR}/${${float_char}GEMVUKERNEL}" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "CONJ;TRANSA" "gemv_c" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false ${float_type}) From 0d8e227ea76514ce18db97a544197ec5c49d0c96 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 12:26:33 -0600 Subject: [PATCH 095/257] Changed strategy for setting preprocessor definitions. Instead of generating separate object files for each permutation of defines for a source file, GenerateNamedObjects now writes an entirely new source file and inserts the defines as #define c statements. 
This solves a problem I ran into with ar.exe where it was refusing to link objects that had the same filename despite having different paths. --- CMakeLists.txt | 16 ++++------------ cmake/utils.cmake | 29 ++++++++++++++++++++--------- driver/level2/CMakeLists.txt | 3 +-- driver/level3/CMakeLists.txt | 3 +-- driver/others/CMakeLists.txt | 11 +---------- interface/CMakeLists.txt | 3 +-- kernel/CMakeLists.txt | 2 +- lapack/CMakeLists.txt | 2 +- 8 files changed, 30 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85b20b176..530f3dda3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,24 +75,16 @@ if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") endif () -# Let CMake handle this -#if (${NOFORTRAN}) -# message(ERROR "OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.") -#endif () - if (${NO_STATIC} AND ${NO_SHARED}) message(FATAL_ERROR "Neither static nor shared are enabled.") endif () -set(DBLAS_OBJS "") -foreach (SUBDIR ${SUBDIRS}) - add_subdirectory(${SUBDIR}) -endforeach () - # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") -foreach (DBLAS_OBJ ${DBLAS_OBJS}) - list(APPEND TARGET_OBJS "$") +foreach (SUBDIR ${SUBDIRS}) + add_subdirectory(${SUBDIR}) + string(REPLACE "/" "_" subdir_obj ${SUBDIR}) + list(APPEND TARGET_OBJS "$") endforeach () # netlib: diff --git a/cmake/utils.cmake b/cmake/utils.cmake index edf25cdb2..498c3840a 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -13,7 +13,6 @@ function(ParseGetArchVars GETARCH_IN) endfunction () # Reads a Makefile into CMake vars. -# TODO: respect IFDEF/IFNDEF? 
macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") file(STRINGS ${MAKEFILE_IN} makefile_contents) @@ -215,16 +214,30 @@ function(GenerateNamedObjects sources_in) endif () endif () - add_library(${obj_name} OBJECT ${source_file}) - set_target_properties(${obj_name} PROPERTIES COMPILE_DEFINITIONS "${obj_defines}") + if (VERBOSE_GEN) + message(STATUS "${obj_name}:${source_file}") + message(STATUS "${obj_defines}") + endif () + + # create a copy of the source to avoid duplicate obj filename problem with ar.exe + get_filename_component(source_extension ${source_file} EXT) + set(new_source_file "${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${obj_name}${source_extension}") + if (IS_ABSOLUTE ${source_file}) + set(old_source_file ${source_file}) + else () + set(old_source_file "${CMAKE_CURRENT_LIST_DIR}/${source_file}") + endif () - list(APPEND OBJ_LIST_OUT ${obj_name}) + string(REPLACE ";" "\n#define " define_source "${obj_defines}") + string(REPLACE "=" " " define_source "${define_source}") + file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"") + list(APPEND SRC_LIST_OUT ${new_source_file}) endforeach () endforeach () - list(APPEND DBLAS_OBJS ${OBJ_LIST_OUT}) - set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) + list(APPEND OPENBLAS_SRC ${SRC_LIST_OUT}) + set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) endfunction () # generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in @@ -260,7 +273,6 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de set(define_combos ${LIST_OUT}) set(define_codes ${CODES_OUT}) - set(COMBO_OBJ_LIST_OUT "") list(LENGTH define_combos num_combos) math(EXPR num_combos "${num_combos} - 1") @@ -322,10 +334,9 @@ function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_de endif () GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" false 
"${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") - list(APPEND COMBO_OBJ_LIST_OUT "${OBJ_LIST_OUT}") endforeach () endforeach () - set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) + set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) endfunction () diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 54e0eb42f..be275724f 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -115,5 +115,4 @@ if (SMP) endif () -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS - +add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 85bde071d..fac96cc82 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -69,5 +69,4 @@ endforeach () #endif # -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS - +add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 3e17ce5be..938f1daaf 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -62,14 +62,6 @@ endif () #COMMONOBJS += profile.$(SUFFIX) #endif -add_library(COMMON_OBJS OBJECT - ${MEMORY} - ${SMP_SOURCES} - ${COMMON_SOURCES} -) - -list(APPEND DBLAS_OBJS "COMMON_OBJS") - #LIBOTHERS = libothers.$(LIBSUFFIX) #ifeq ($(DYNAMIC_ARCH), 1) @@ -78,5 +70,4 @@ list(APPEND DBLAS_OBJS "COMMON_OBJS") #HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) #endif -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS - +add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 1ca554307..ae949235b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -117,5 +117,4 @@ if (NOT DEFINED NO_LAPACK) 
GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS - +add_library(interface OBJECT ${OPENBLAS_SRC}) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 939eef6c7..368af90b2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -251,4 +251,4 @@ endforeach () # Makefile.LA #DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS +add_library(kernel OBJECT ${OPENBLAS_SRC}) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 7e17de7de..c8c82219d 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -99,5 +99,5 @@ GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" "" 3) GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) -set(DBLAS_OBJS ${DBLAS_OBJS} PARENT_SCOPE) # list append removes the scope from DBLAS_OBJS +add_library(lapack OBJECT ${OPENBLAS_SRC}) From 2416d9dbac3dda3ce54cc659da18e4eb9b764989 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 13:18:07 -0600 Subject: [PATCH 096/257] Fixed TRANSA defines for complex sources in driver/level2. 
--- driver/level2/CMakeLists.txt | 110 ++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index be275724f..d6179c0fb 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -12,12 +12,17 @@ set(UL_SOURCES ) # sources that need to be compiled several times, for UNIT, TRANSA -set(NU_SOURCES +set(U_SOURCES + trmv_U.c tbmv_U.c tbsv_U.c tpmv_U.c tpsv_U.c trsv_U.c +) + +set(L_SOURCES + trmv_L.c tbmv_L.c tbsv_L.c tpmv_L.c @@ -25,66 +30,60 @@ set(NU_SOURCES trsv_L.c ) +set(UL_SMP_SOURCES + symv_thread.c + syr_thread.c + syr2_thread.c + spr_thread.c + spr2_thread.c + spmv_thread.c + sbmv_thread.c +) + +set(NU_SMP_SOURCES + trmv_thread.c + tpmv_thread.c + tbmv_thread.c +) + # objects that need LOWER set GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) -# objects that need TRANSA and UNIT set -# N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* because of course why not have a different naming scheme for every single object -hpa -GenerateCombinationObjects("${NU_SOURCES}" "TRANSA;UNIT" "N;N" "" 3 "" "" 3) - -# gbmv uses a lowercase n and t. WHY? WHO KNOWS! 
+# gbmv uses a lowercase n and t GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) # special defines for complex trmv foreach (float_type ${FLOAT_TYPES}) - if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") - GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=1" 0 "trmv_NU" false ${float_type}) - GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=2" 0 "trmv_TL" false ${float_type}) - GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=3" 0 "trmv_RU" false ${float_type}) - GenerateCombinationObjects("ztrmv_U.c" "UNIT" "N" "TRANSA=4" 0 "trmv_CL" false ${float_type}) - GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=1" 0 "trmv_NL" false ${float_type}) - GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=2" 0 "trmv_TU" false ${float_type}) - GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=3" 0 "trmv_RL" false ${float_type}) - GenerateCombinationObjects("ztrmv_L.c" "UNIT" "N" "TRANSA=4" 0 "trmv_CU" false ${float_type}) - else () - GenerateCombinationObjects("trmv_U.c;trmv_L.c" "TRANSA;UNIT" "N;N" "" 3 "" false ${float_type}) - endif () -endforeach () - -if (SMP) - - - set(UL_SMP_SOURCES - symv_thread.c - syr_thread.c - syr2_thread.c - spr_thread.c - spr2_thread.c - spmv_thread.c - sbmv_thread.c - ) - - GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) - - set(NU_SMP_SOURCES - trmv_thread.c - tpmv_thread.c - tbmv_thread.c - ) - - GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2) - - foreach (float_type ${FLOAT_TYPES}) + if (SMP) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false 
${float_type}) + endif () + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + + foreach (u_source ${U_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${u_source}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NU" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TL" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=3" 0 "${op_name}_RU" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CL" false ${float_type}) + endforeach () + + foreach (l_source ${L_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${l_source}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NL" false ${float_type}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TU" false ${float_type}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=3" 0 "${op_name}_RL" false ${float_type}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CU" false ${float_type}) + endforeach () + if (SMP) - if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "trmv_thread_R" false ${float_type}) GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "trmv_thread_C" false ${float_type}) @@ -107,12 +106,27 @@ if (SMP) GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) - else () - GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) + foreach (nu_smp_src ${NU_SMP_SOURCSE}) + string(REGEX MATCH "[a-z]+" op_name ${nu_smp_src}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=1" 0 "${op_name}_N" false ${float_type}) + 
GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=2" 0 "${op_name}_T" false ${float_type}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "${op_name}_R" false ${float_type}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "${op_name}_C" false ${float_type}) + endforeach () endif () - endforeach () + else () + # N.B. BLAS wants to put the U/L from the filename in the *MIDDLE* + GenerateCombinationObjects("${U_SOURCES};${L_SOURCES}" "TRANSA;UNIT" "N;N" "" 3 "" false ${float_type}) + if (SMP) + GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) + GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2 "" false ${float_type}) + endif () + endif () +endforeach () +if (SMP) + GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) endif () add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) From 0553476fbaf668679c45b9f65c29249f97f2bdae Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 14:30:35 -0600 Subject: [PATCH 097/257] Added TRANS defines for complex sources in lapack. 
--- driver/level2/CMakeLists.txt | 2 +- lapack/CMakeLists.txt | 35 +++++++++++++++-------------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index d6179c0fb..d35069cf9 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -53,7 +53,7 @@ GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) -# special defines for complex trmv +# special defines for complex foreach (float_type ${FLOAT_TYPES}) if (SMP) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c8c82219d..de42e1ab6 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -38,25 +38,6 @@ set(UNIT_SOURCES2 trti2/trti2_L.c ) -# TODO: getrs needs to be compiled with up to TRANS=4 in the complex case -set(ZLAPACK_SOURCES - getf2/zgetf2_k.c - getrf/getrf_single.c - getrs/zgetrs_single.c - potrf/potrf_U_single.c - potrf/potrf_L_single.c - potf2/potf2_U.c - potf2/potf2_L.c - lauu2/zlauu2_U.c - lauu2/zlauu2_L.c - lauum/lauum_U_single.c - lauum/lauum_L_single.c - trti2/ztrti2_U.c - trti2/ztrti2_L.c - trtri/trtri_U_single.c - trtri/trtri_L_single.c -) - GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) @@ -95,7 +76,21 @@ if (SMP) GenerateNamedObjects("${PARALLEL_SOURCES}") endif () -GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" "" 3) +foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + foreach (trans_src ${TRANS_SOURCES}) + string(REGEX MATCH "[a-z]/([a-z]+_)([a-z]+)" op_name ${trans_src}) + string(REPLACE "/" "/z" ztrans_src ${trans_src}) + GenerateNamedObjects("${ztrans_src}" "TRANS=1" "${CMAKE_MATCH_1}N_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" 
"TRANS=2" "${CMAKE_MATCH_1}T_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" "TRANS=3" "${CMAKE_MATCH_1}R_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" "TRANS=4" "${CMAKE_MATCH_1}C_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + endforeach () + else () + GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" false ${float_type}) + endif () +endforeach () + GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) From a8002b0c5f961856c0f8d8c924418d37474a0032 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 14:31:18 -0600 Subject: [PATCH 098/257] Separated getarch ASM file when using MSVC. --- cmake/prebuild.cmake | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 32faeeea7..d2bada364 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -54,11 +54,20 @@ include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") # compile getarch +set(GETARCH_SRC + ${CMAKE_SOURCE_DIR}/getarch.c + ${CPUIDEMO} +) + +if (NOT MSVC) + list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) +endif () + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${CMAKE_SOURCE_DIR}/getarch.c ${CMAKE_SOURCE_DIR}/cpuid.S ${CPUIDEMO} + SOURCES ${GETARCH_SRC} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${GETARCH_BIN} From 504cdb10ed0eab06662be1179b0be141764ee17c Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 14:31:45 -0600 Subject: [PATCH 099/257] Added check for MSVC before enabling fortran. Currently forcing gfortran, instead of assuming ifort. 
--- CMakeLists.txt | 1 - cmake/f_check.cmake | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 530f3dda3..64d27da42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,6 @@ set(OpenBLAS_MINOR_VERSION 2) set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") -enable_language(Fortran) enable_language(ASM) message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 266cdbb2a..f7651db56 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,6 +20,13 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES +if (MSVC) + # had to do this for MSVC, else CMake automatically assumes I have ifort... -hpa + include(CMakeForceCompiler) + CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) +endif () +enable_language(Fortran) + if (NOT ONLY_CBLAS) # N.B. f_check is not cross-platform, so instead try to use CMake variables # run f_check (appends to TARGET files) From ab7043373fa1b6993439ca1b14f14bb8967198dd Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Tue, 24 Feb 2015 15:18:16 -0600 Subject: [PATCH 100/257] Fixed bug generating trmv complex source names. 
--- driver/level2/CMakeLists.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index d35069cf9..e4440be6d 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -84,9 +84,6 @@ foreach (float_type ${FLOAT_TYPES}) if (SMP) - GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "trmv_thread_R" false ${float_type}) - GenerateCombinationObjects("trmv_thread.c" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "trmv_thread_C" false ${float_type}) - GenerateNamedObjects("gemv_thread.c" "CONJ" "gemv_thread_r" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "CONJ;TRANSA" "gemv_thread_c" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "XCONJ" "gemv_thread_o" false "" "" false ${float_type}) @@ -106,8 +103,8 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) - foreach (nu_smp_src ${NU_SMP_SOURCSE}) - string(REGEX MATCH "[a-z]+" op_name ${nu_smp_src}) + foreach (nu_smp_src ${NU_SMP_SOURCES}) + string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_src}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=1" 0 "${op_name}_N" false ${float_type}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=2" 0 "${op_name}_T" false ${float_type}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "${op_name}_R" false ${float_type}) From 9eaea02f33a52443814be07ed533420715f57698 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 09:39:11 -0600 Subject: [PATCH 101/257] Added additional gemm defines for complex types. 
--- driver/level3/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index fac96cc82..376a0beeb 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -12,6 +12,7 @@ endif () # loop through gemm.c defines set(GEMM_DEFINES NN NT TN TT) +set(GEMM_COMPLEX_DEFINES RN CN RT CT NR TR RR CR NC TC RC CC) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) @@ -53,6 +54,14 @@ foreach (float_type ${FLOAT_TYPES}) GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_LC" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_RC" false ${float_type}) + # special gemm defines for complex + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + GenerateNamedObjects("gemm.c" "${gemm_define}" "gemm_${gemm_define_LC}" false "" "" false ${float_type}) + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false ${float_type}) + endif () + endforeach () endif () endforeach () From 00e373aea6c4f8f11797d230bba0ce49f573b191 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 10:18:18 -0600 Subject: [PATCH 102/257] Added LAPACK sources directly to add_library call instead of OBJECT. --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64d27da42..cbc4fb9a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,12 +91,10 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. 
We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") -add_library(LA_OBJ OBJECT ${LA_SOURCES}) -set_target_properties(LA_OBJ PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") -list(APPEND TARGET_OBJS "$") +set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") # add objects to the openblas lib -add_library(openblas ${TARGET_OBJS}) +add_library(openblas ${LA_SOURCES} ${TARGET_OBJS}) # TODO: Why is the config saved here? Is this necessary with CMake? #Save the config files for installation From 518e2424a8b14f314a850b297c867f9f18bd622e Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 11:51:29 -0600 Subject: [PATCH 103/257] Fixed bad filename for cpuid.S compile. --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 368af90b2..4fe27a7d0 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -20,7 +20,7 @@ ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") ParseMakefileVars("${KERNELDIR}/KERNEL") if (${ARCH} STREQUAL "x86") - GenerateNamedObjects("${KERNELDIR}/cpuid.S" "") + GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) endif () # don't use float type name mangling here From 84d90d6ed85853eecb3ea17f1f23d3a5d7e8d264 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 11:52:25 -0600 Subject: [PATCH 104/257] Fixed some compiler errors/warnings for clang. 
--- common.h | 2 +- cpuid_x86.c | 4 ++-- driver/others/memory.c | 4 ++-- kernel/arm/zdot.c | 6 +++++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/common.h b/common.h index 0761f5536..0a7bb66bc 100644 --- a/common.h +++ b/common.h @@ -310,7 +310,7 @@ typedef int blasint; #endif #if defined(OS_WINDOWS) -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) #define YIELDING YieldProcessor() #else #define YIELDING SwitchToThread() diff --git a/cpuid_x86.c b/cpuid_x86.c index 6b7e408d8..8a8a802a0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -40,7 +40,7 @@ #include #include "cpuid.h" -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) #define C_INLINE __inline #else #define C_INLINE inline @@ -154,7 +154,7 @@ static C_INLINE int have_excpuid(void){ #ifndef NO_AVX static C_INLINE void xgetbv(int op, int * eax, int * edx){ //Use binary code for xgetbv -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) *eax = __xgetbv(op); #else __asm__ __volatile__ diff --git a/driver/others/memory.c b/driver/others/memory.c index 16d68cced..fa364785b 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -769,12 +769,12 @@ static void *alloc_hugetlb(void *address){ if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); - return -1; + return (void*)-1; } if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { CloseHandle(hToken); - return -1; + return (void*)-1; } map_address = (void *)VirtualAlloc(address, diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 469487531..198104022 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -35,9 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**************************************************************************************/ #include "common.h" -#include +#ifndef _MSC_VER +#include FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +openblas_complex_double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif { BLASLONG i=0; BLASLONG ix=0,iy=0; From 5ae8993752886033161ef74184f333a2401c8ba9 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 11:52:51 -0600 Subject: [PATCH 105/257] Added intrinsics for MSVC. --- common_x86.h | 29 +++++++++++++++++++++++++++++ driver/others/memory.c | 27 +++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/common_x86.h b/common_x86.h index f096e9074..0cb242c4e 100644 --- a/common_x86.h +++ b/common_x86.h @@ -56,41 +56,65 @@ static void __inline blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; +#if defined(_MSC_VER) && !defined(__clang__) + // use intrinsic instead of inline assembly + ret = _InterlockedExchange(address, 1); + // inline assembly + /*__asm { + mov eax, address + mov ebx, 1 + xchg [eax], ebx + mov ret, ebx + }*/ +#else __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); +#endif } while (ret); } static __inline unsigned long long rpcc(void){ +#if defined(_MSC_VER) && !defined(__clang__) + return __rdtsc(); // use MSVC intrinsic +#else unsigned int a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((unsigned long long)a + ((unsigned long long)d << 32)); +#endif }; static __inline unsigned long getstackaddr(void){ +#if defined(_MSC_VER) && !defined(__clang__) + return (unsigned long)_ReturnAddress(); // use MSVC intrinsic +#else unsigned long addr; __asm__ __volatile__ ("mov %%esp, %0" : "=r"(addr) : : "memory"); return addr; +#endif }; static __inline long double sqrt_long(long double val) { +#if defined(_MSC_VER) && !defined(__clang__) + return sqrt(val); // not 
sure if this will use fsqrt +#else long double result; __asm__ __volatile__ ("fldt %1\n" "fsqrt\n" "fstpt %0\n" : "=m" (result) : "m"(val)); return result; +#endif } #define SQRT(a) sqrt_long(a) @@ -146,9 +170,14 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; +#if defined(_MSC_VER) && !defined(__clang__) + (void*)result; + return x*y; +#else __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; +#endif } #endif diff --git a/driver/others/memory.c b/driver/others/memory.c index fa364785b..70bfa7a57 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -137,8 +137,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +#if defined(_MSC_VER) && !defined(__clang__) +#define CONSTRUCTOR __cdecl +#define DESTRUCTOR __cdecl +#else #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) +#endif #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; @@ -1360,6 +1365,28 @@ void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); } +#if defined(_MSC_VER) && !defined(__clang__) +BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + default: + break; + } + return TRUE; +} +#endif + #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) /* Don't call me; this is just work around for PGI / Sun bug */ void gotoblas_dummy_for_PGI(void) { From 3649cfbd7be57028c83eea956f3c13d5fb403756 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 12:23:26 -0600 Subject: [PATCH 106/257] Fixed EPILOGUE for clang. 
--- common_x86.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_x86.h b/common_x86.h index 0cb242c4e..9d2df41b8 100644 --- a/common_x86.h +++ b/common_x86.h @@ -313,8 +313,12 @@ REALNAME: #define PROFCODE +#ifdef __clang__ +#define EPILOGUE .end +#else #define EPILOGUE .end REALNAME #endif +#endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ From e19bf3a28bf13d9c0e1306ba07c41aa167561579 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 14:44:49 -0600 Subject: [PATCH 107/257] Removed MSVC cpuid func when using clang. --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 8a8a802a0..c85e9424d 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -59,7 +59,7 @@ #endif */ -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) { From 1d183dcda8332ecadbd4aa0c59232d14651ffe52 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Wed, 25 Feb 2015 16:51:08 -0600 Subject: [PATCH 108/257] Added lapacke sources. --- CMakeLists.txt | 10 +++++++--- cmake/lapack.cmake | 2 +- cmake/prebuild.cmake | 5 ++++- cmake/system.cmake | 10 ++++++---- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cbc4fb9a1..5c2681141 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,11 +90,15 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. 
-include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") -set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") +if (NOT NOFORTRAN) + include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") +endif () +if (NOT NO_LAPACKE) + include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") +endif () # add objects to the openblas lib -add_library(openblas ${LA_SOURCES} ${TARGET_OBJS}) +add_library(openblas ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) # TODO: Why is the config saved here? Is this necessary with CMake? #Save the config files for installation diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index e8d19f10d..3e81611ab 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -344,4 +344,4 @@ set(LA_SOURCES "") foreach (LA_FILE ${LA_REL_SRC}) list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") endforeach () - +set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index d2bada364..3e2574f77 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -51,7 +51,10 @@ else() endif () include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") -include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") + +if (NOT NOFORTRAN) + include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") +endif () # compile getarch set(GETARCH_SRC diff --git a/cmake/system.cmake b/cmake/system.cmake index cc7373e47..36f9b7cbd 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -132,8 +132,10 @@ include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") -# Fortran Compiler dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") +if (NOT NOFORTRAN) + # Fortran Compiler dependent settings + include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") +endif () if (BINARY64) if (INTERFACE64) @@ -342,7 +344,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endforeach () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN") +if 
("${F_COMPILER}" STREQUAL "GFORTRAN") # lapack-netlib is rife with uninitialized warnings -hpa set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized") endif () @@ -356,7 +358,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DOPENBLAS_OS_WINDOWS") endif () -if (${CMAKE_C_COMPILER} STREQUAL "LSB") +if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () From e5b96e55a727bee4010a2ae4c96c6f71327ba96b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 24 Mar 2015 15:27:17 -0500 Subject: [PATCH 109/257] Fix build bug for ARM64. --- kernel/arm64/KERNEL.XGENE1 | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/arm64/KERNEL.XGENE1 diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 new file mode 100644 index 000000000..6ee0c730c --- /dev/null +++ b/kernel/arm64/KERNEL.XGENE1 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file From 0ac787eefe7cf1e19e990d7ef5f94a91b5a34317 Mon Sep 17 00:00:00 2001 From: xantares Date: Mon, 30 Mar 2015 09:30:55 +0200 Subject: [PATCH 110/257] fix mingw install --- Makefile.install | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index e1deaae3e..a5814e55a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -86,8 +86,8 @@ ifeq ($(OSNAME), Darwin) ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) - @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) - @-cp $(LIBDLLNAME).a $(OPENBLAS_LIBRARY_DIR) + @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) From 8977b3f2357c6dcc1e93812a0bf902d0aa37da65 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 2 Apr 2015 11:08:03 -0500 Subject: [PATCH 111/257] Refs #529. 
Support Intel Broadwell by Haswell kernels. --- cpuid_x86.c | 70 +++++++++++++++++++++++++++++++++++++++++ driver/others/dynamic.c | 29 +++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index ef90b26d8..aece9d871 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1098,6 +1098,16 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 13: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1112,11 +1122,36 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 7: + case 15: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; } break; + case 5: + switch (model) { + case 6: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -1525,6 +1560,16 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 13: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; @@ -1539,11 +1584,36 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 7: + case 15: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; } break; + case 5: + switch (model) { + case 6: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + break; } break; diff --git a/driver/others/dynamic.c 
b/driver/others/dynamic.c index 60b3c72af..6945c17d4 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -221,6 +221,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Broadwell + if (model == 13) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; case 4: //Intel Haswell @@ -232,6 +241,26 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Broadwell + if (model == 7 || model == 15) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 5: + //Intel Broadwell + if (model == 6) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; } case 0xf: From 701b9d7556d798f94d9e5c30246787892bb22452 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 5 Apr 2015 17:57:53 +0200 Subject: [PATCH 112/257] added optimized sdot- and ddot-kernel for HASWELL --- kernel/x86_64/KERNEL.HASWELL | 4 ++ kernel/x86_64/ddot.c | 7 +- kernel/x86_64/ddot_microk_haswell-2.c | 95 ++++++++++++++++++++++++++ kernel/x86_64/sdot.c | 4 +- kernel/x86_64/sdot_microk_haswell-2.c | 96 +++++++++++++++++++++++++++ 5 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 kernel/x86_64/ddot_microk_haswell-2.c create mode 100644 kernel/x86_64/sdot_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index a621b4484..c8fb7da24 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -10,6 +10,10 @@ ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c + + SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index d501c2f68..9eda0fc44 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ddot_microk_bulldozer-2.c" -#elif defined(NEHALEM) +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ddot_microk_haswell-2.c" #endif @@ -75,12 +77,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + int n1 = n & -16; if ( n1 ) ddot_kernel_8(n1, x, y , &dot ); - i = n1; while(i < n) { diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c new file mode 100644 index 000000000..d36577af3 --- /dev/null +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+/* AVX/FMA double-precision dot-product micro-kernel: *dot = sum of x[k]*y[k] for k in [0, n). */
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+        /* Each pass of the loop consumes 16 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 16.                             */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"  // clear the four partial-sum accumulators
+        "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
+        "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
+        "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
+
+        ".align 16 \n\t"
+        "1: \n\t"
+        "vmovups (%2,%0,8), %%ymm12 \n\t"  // 4 * x
+        "vmovups 32(%2,%0,8), %%ymm13 \n\t"  // 4 * x
+        "vmovups 64(%2,%0,8), %%ymm14 \n\t"  // 4 * x
+        "vmovups 96(%2,%0,8), %%ymm15 \n\t"  // 4 * x
+
+        "vfmadd231pd (%3,%0,8), %%ymm12, %%ymm4 \n\t"  // acc += x * (4 * y)
+        "vfmadd231pd 32(%3,%0,8), %%ymm13, %%ymm5 \n\t"  // acc += x * (4 * y)
+        "vfmadd231pd 64(%3,%0,8), %%ymm14, %%ymm6 \n\t"  // acc += x * (4 * y)
+        "vfmadd231pd 96(%3,%0,8), %%ymm15, %%ymm7 \n\t"  // acc += x * (4 * y)
+
+        "addq $16 , %0 \n\t"
+        "subq $16 , %1 \n\t"
+        "jnz 1b \n\t"
+
+        "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"  // fold upper 128-bit halves onto lower halves
+        "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
+        "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t"
+        "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t"
+
+        "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t"
+        "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t"
+        "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t"
+        "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t"
+
+        "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"  // combine the four accumulators
+        "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
+        "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
+
+        "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"  // horizontal add of the last two lanes
+
+        "vmovsd %%xmm4, (%4) \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (dot)     // 4
+        : "cc",
+          "%xmm4", "%xmm5",
+          "%xmm6", "%xmm7",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 6fec48175..222bc358a 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "sdot_microk_haswell-2.c" #endif @@ -74,7 +76,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + int n1 = n & -32; if ( n1 ) sdot_kernel_16(n1, x, y , &dot ); diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c new file mode 100644 index 000000000..4051f9c1b --- /dev/null +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+/* AVX/FMA single-precision dot-product micro-kernel: *dot = sum of x[k]*y[k] for k in [0, n). */
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+        /* Each pass of the loop consumes 32 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 32.                             */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t"  // clear the four partial-sum accumulators
+        "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t"
+        "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
+        "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"
+
+        ".align 16 \n\t"
+        "1: \n\t"
+        "vmovups (%2,%0,4), %%ymm12 \n\t"  // 8 * x
+        "vmovups 32(%2,%0,4), %%ymm13 \n\t"  // 8 * x
+        "vmovups 64(%2,%0,4), %%ymm14 \n\t"  // 8 * x
+        "vmovups 96(%2,%0,4), %%ymm15 \n\t"  // 8 * x
+
+        "vfmadd231ps (%3,%0,4), %%ymm12, %%ymm4 \n\t"  // acc += x * (8 * y)
+        "vfmadd231ps 32(%3,%0,4), %%ymm13, %%ymm5 \n\t"  // acc += x * (8 * y)
+        "vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t"  // acc += x * (8 * y)
+        "vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t"  // acc += x * (8 * y)
+
+        "addq $32 , %0 \n\t"
+        "subq $32 , %1 \n\t"
+        "jnz 1b \n\t"
+
+        "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"  // fold upper 128-bit halves onto lower halves
+        "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
+        "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t"
+        "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t"
+
+        "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
+        "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
+        "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
+        "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"
+
+        "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"  // combine the four accumulators
+        "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
+        "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
+
+        "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"  // two horizontal adds reduce four lanes to one
+        "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
+
+        "vmovss %%xmm4, (%4) \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (dot)     // 4
+        : "cc",
+          "%xmm4", "%xmm5",
+          "%xmm6", "%xmm7",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
From 9707d608d570bf13497fd08ea4632b34d2d48cfb Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 5 Apr 2015 18:35:34 +0200 Subject: [PATCH 113/257] removed double definition line --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 289529772..791c18146 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,7 +3,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c +#DDOTKERNEL = ddot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c From 3937e2a0a0954e01c24576dc3ca46d4e2f69fac1 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 5 Apr 2015 19:47:05 +0200 Subject: [PATCH 114/257] add optimized sdot-kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 + kernel/x86_64/sdot.c | 2 + kernel/x86_64/sdot_microk_sandy-2.c | 101 ++++++++++++++++++++++++++++ lapack-netlib/TESTING/sep.in | 2 +- 4 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sdot_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index ff96cd011..ac41d2772 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -3,6 +3,8 @@ SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c +SDOTKERNEL = sdot.c +DDOTKERNEL = 
ddot.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 222bc358a..edce81b89 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,6 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) #include "sdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "sdot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c new file mode 100644 index 000000000..e265d16bd --- /dev/null +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -0,0 +1,101 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+/* AVX (mul+add, no FMA) single-precision dot-product micro-kernel: *dot = sum of x[k]*y[k], k in [0, n). */
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+        /* Each pass of the loop consumes 32 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 32.                             */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t"  // clear the four partial-sum accumulators
+        "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t"
+        "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
+        "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"
+
+        ".align 16 \n\t"
+        "1: \n\t"
+        "vmovups (%2,%0,4), %%ymm12 \n\t"  // 8 * x
+        "vmovups 32(%2,%0,4), %%ymm13 \n\t"  // 8 * x
+        "vmovups 64(%2,%0,4), %%ymm14 \n\t"  // 8 * x
+        "vmovups 96(%2,%0,4), %%ymm15 \n\t"  // 8 * x
+
+        "vmulps (%3,%0,4), %%ymm12, %%ymm12 \n\t"  // x * (8 * y)
+        "vmulps 32(%3,%0,4), %%ymm13, %%ymm13 \n\t"  // x * (8 * y)
+        "vmulps 64(%3,%0,4), %%ymm14, %%ymm14 \n\t"  // x * (8 * y)
+        "vmulps 96(%3,%0,4), %%ymm15, %%ymm15 \n\t"  // x * (8 * y)
+
+        "vaddps %%ymm4 , %%ymm12, %%ymm4 \n\t"  // acc += product
+        "vaddps %%ymm5 , %%ymm13, %%ymm5 \n\t"  // acc += product
+        "vaddps %%ymm6 , %%ymm14, %%ymm6 \n\t"  // acc += product
+        "vaddps %%ymm7 , %%ymm15, %%ymm7 \n\t"  // acc += product
+
+        "addq $32 , %0 \n\t"
+        "subq $32 , %1 \n\t"
+        "jnz 1b \n\t"
+
+        "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"  // fold upper 128-bit halves onto lower halves
+        "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
+        "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t"
+        "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t"
+
+        "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t"
+        "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t"
+        "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t"
+        "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t"
+
+        "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"  // combine the four accumulators
+        "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
+        "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
+
+        "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"  // two horizontal adds reduce four lanes to one
+        "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
+
+        "vmovss %%xmm4, (%4) \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (dot)     // 4
+        : "cc",
+          "%xmm4", "%xmm5",
+          "%xmm6", "%xmm7",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
diff --git a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index 19bd7c3da..c71a754c7 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -5,7 +5,7 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) -160.0 Threshold value +170.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits From a901b065d333ed47ed72370fb55908e29e121a87 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 5 Apr 2015 20:19:38 +0200 Subject: [PATCH 115/257] added optimized ddot-kernel for sandybridge --- kernel/x86_64/ddot.c | 2 + kernel/x86_64/ddot_microk_sandy-2.c | 100 ++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 kernel/x86_64/ddot_microk_sandy-2.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 9eda0fc44..f857c2f35 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) #include "ddot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ddot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c new file mode 100644 index 000000000..e2e6701c7 --- /dev/null +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+/* AVX (mul+add, no FMA) double-precision dot-product micro-kernel: *dot = sum of x[k]*y[k], k in [0, n). */
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+        /* Each pass of the loop consumes 16 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 16.                             */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"  // clear the four partial-sum accumulators
+        "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
+        "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
+        "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
+
+        ".align 16 \n\t"
+        "1: \n\t"
+        "vmovups (%2,%0,8), %%ymm12 \n\t"  // 4 * x
+        "vmovups 32(%2,%0,8), %%ymm13 \n\t"  // 4 * x
+        "vmovups 64(%2,%0,8), %%ymm14 \n\t"  // 4 * x
+        "vmovups 96(%2,%0,8), %%ymm15 \n\t"  // 4 * x
+
+        "vmulpd (%3,%0,8), %%ymm12, %%ymm12 \n\t"  // x * (4 * y)
+        "vmulpd 32(%3,%0,8), %%ymm13, %%ymm13 \n\t"  // x * (4 * y)
+        "vmulpd 64(%3,%0,8), %%ymm14, %%ymm14 \n\t"  // x * (4 * y)
+        "vmulpd 96(%3,%0,8), %%ymm15, %%ymm15 \n\t"  // x * (4 * y)
+
+        "vaddpd %%ymm4 , %%ymm12, %%ymm4 \n\t"  // acc += product
+        "vaddpd %%ymm5 , %%ymm13, %%ymm5 \n\t"  // acc += product
+        "vaddpd %%ymm6 , %%ymm14, %%ymm6 \n\t"  // acc += product
+        "vaddpd %%ymm7 , %%ymm15, %%ymm7 \n\t"  // acc += product
+
+        "addq $16 , %0 \n\t"
+        "subq $16 , %1 \n\t"
+        "jnz 1b \n\t"
+
+        "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"  // fold upper 128-bit halves onto lower halves
+        "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"
+        "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t"
+        "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t"
+
+        "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t"
+        "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t"
+        "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t"
+        "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t"
+
+        "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"  // combine the four accumulators
+        "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
+        "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
+
+        "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"  // horizontal add of the last two lanes
+
+        "vmovsd %%xmm4, (%4) \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (dot)     // 4
+        : "cc",
+          "%xmm4", "%xmm5",
+          "%xmm6", "%xmm7",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
From 
53bb9242873919a6f0951bbef06eceafa27f8408 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 6 Apr 2015 12:33:16 +0200 Subject: [PATCH 116/257] added optimized saxpy- and daxpy-kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/daxpy.c | 4 +- kernel/x86_64/daxpy_microk_haswell-2.c | 78 ++++++++++++++++++++++++++ kernel/x86_64/saxpy.c | 4 +- kernel/x86_64/saxpy_microk_haswell-2.c | 78 ++++++++++++++++++++++++++ 5 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/daxpy_microk_haswell-2.c create mode 100644 kernel/x86_64/saxpy_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index c8fb7da24..ea3d95872 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -13,6 +13,8 @@ CGEMVTKERNEL = cgemv_t_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +DAXPYKERNEL = daxpy.c +SAXPYKERNEL = saxpy.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index fd5343eba..753b6b445 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_nehalem-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "daxpy_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "daxpy_microk_haswell-2.c" #endif @@ -71,7 +73,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + int n1 = n & -16; if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c new file mode 100644 index 000000000..db117a8ba --- /dev/null +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+/* AVX/FMA double-precision axpy micro-kernel: y[k] += (*alpha) * x[k] for k in [0, n), in place. */
+#define HAVE_KERNEL_8 1
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+        /* Each pass of the loop updates 16 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 16.                            */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vbroadcastsd (%4), %%ymm0 \n\t"  // alpha
+
+        ".align 16 \n\t"
+        "1: \n\t"
+
+        "vmovups (%3,%0,8), %%ymm12 \n\t"  // 4 * y
+        "vmovups 32(%3,%0,8), %%ymm13 \n\t"  // 4 * y
+        "vmovups 64(%3,%0,8), %%ymm14 \n\t"  // 4 * y
+        "vmovups 96(%3,%0,8), %%ymm15 \n\t"  // 4 * y
+        "vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t"  // y += alpha * x
+        "vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t"  // y += alpha * x
+        "vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t"  // y += alpha * x
+        "vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t"  // y += alpha * x
+        "vmovups %%ymm12, (%3,%0,8) \n\t"  // store updated y
+        "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+        "vmovups %%ymm14, 64(%3,%0,8) \n\t"
+        "vmovups %%ymm15, 96(%3,%0,8) \n\t"
+
+        "addq $16, %0 \n\t"
+        "subq $16, %1 \n\t"
+        "jnz 1b \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (alpha)   // 4
+        : "cc",
+          "%xmm0",
+          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index da81f1354..ea83ddbaa 100644 --- a/kernel/x86_64/saxpy.c +++ 
b/kernel/x86_64/saxpy.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "saxpy_microk_haswell-2.c" #endif @@ -69,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + int n1 = n & -32; if ( n1 ) saxpy_kernel_16(n1, x, y , &da ); diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c new file mode 100644 index 000000000..496424207 --- /dev/null +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+/* AVX/FMA single-precision axpy micro-kernel: y[k] += (*alpha) * x[k] for k in [0, n), in place. */
+#define HAVE_KERNEL_16 1
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+        /* Each pass of the loop updates 32 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 32.                            */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vbroadcastss (%4), %%ymm0 \n\t"  // alpha
+
+        ".align 16 \n\t"
+        "1: \n\t"
+
+        "vmovups (%3,%0,4), %%ymm12 \n\t"  // 8 * y
+        "vmovups 32(%3,%0,4), %%ymm13 \n\t"  // 8 * y
+        "vmovups 64(%3,%0,4), %%ymm14 \n\t"  // 8 * y
+        "vmovups 96(%3,%0,4), %%ymm15 \n\t"  // 8 * y
+        "vfmadd231ps (%2,%0,4), %%ymm0 , %%ymm12 \n\t"  // y += alpha * x
+        "vfmadd231ps 32(%2,%0,4), %%ymm0 , %%ymm13 \n\t"  // y += alpha * x
+        "vfmadd231ps 64(%2,%0,4), %%ymm0 , %%ymm14 \n\t"  // y += alpha * x
+        "vfmadd231ps 96(%2,%0,4), %%ymm0 , %%ymm15 \n\t"  // y += alpha * x
+        "vmovups %%ymm12, (%3,%0,4) \n\t"  // store updated y
+        "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+        "vmovups %%ymm14, 64(%3,%0,4) \n\t"
+        "vmovups %%ymm15, 96(%3,%0,4) \n\t"
+
+        "addq $32, %0 \n\t"
+        "subq $32, %1 \n\t"
+        "jnz 1b \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (alpha)   // 4
+        : "cc",
+          "%xmm0",
+          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
From 47898cca352d4d9448c13acce3870fd2c2390013 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 6 Apr 2015 16:05:16 +0200 
Subject: [PATCH 117/257] added optimized saxpy- and daxpy-kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 + kernel/x86_64/daxpy.c | 6 ++ kernel/x86_64/daxpy_microk_sandy-2.c | 100 +++++++++++++++++++++++++++ kernel/x86_64/saxpy.c | 6 ++ kernel/x86_64/saxpy_microk_sandy-2.c | 100 +++++++++++++++++++++++++++ 5 files changed, 215 insertions(+) create mode 100644 kernel/x86_64/daxpy_microk_sandy-2.c create mode 100644 kernel/x86_64/saxpy_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index ac41d2772..b0b6c6c84 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -6,6 +6,9 @@ ZGEMVNKERNEL = zgemv_n_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c + SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 753b6b445..c07b5ca15 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_bulldozer-2.c" #elif defined(HASWELL) #include "daxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "daxpy_microk_sandy-2.c" #endif @@ -73,7 +75,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(SANDYBRIDGE) + int n1 = n & -32; +#else int n1 = n & -16; +#endif if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c new file mode 100644 index 000000000..963ad322d --- /dev/null +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+/* AVX (mul+add, no FMA) double-precision axpy micro-kernel: y[k] += (*alpha) * x[k], k in [0, n), in place. */
+#define HAVE_KERNEL_8 1
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+        /* Each pass of the loop updates 32 elements and the loop only stops when n reaches */
+        /* exactly zero, so n must be a positive multiple of 32.                            */
+        BLASLONG register i = 0;
+
+        __asm__ __volatile__
+        (
+        "vbroadcastsd (%4), %%ymm0 \n\t"  // alpha
+
+        ".align 16 \n\t"
+        "1: \n\t"
+        "vmovups (%3,%0,8), %%ymm8 \n\t"  // load 8 vectors of 4 * y
+        "vmovups 32(%3,%0,8), %%ymm9 \n\t"
+        "vmovups 64(%3,%0,8), %%ymm10 \n\t"
+        "vmovups 96(%3,%0,8), %%ymm11 \n\t"
+        "vmovups 128(%3,%0,8), %%ymm12 \n\t"
+        "vmovups 160(%3,%0,8), %%ymm13 \n\t"
+        "vmovups 192(%3,%0,8), %%ymm14 \n\t"
+        "vmovups 224(%3,%0,8), %%ymm15 \n\t"
+
+        "vmulpd (%2,%0,8), %%ymm0, %%ymm1 \n\t"  // alpha * x, interleaved with y += product
+        "vmulpd 32(%2,%0,8), %%ymm0, %%ymm2 \n\t"
+        "vaddpd %%ymm8 , %%ymm1, %%ymm8 \n\t"
+        "vmulpd 64(%2,%0,8), %%ymm0, %%ymm3 \n\t"
+        "vaddpd %%ymm9 , %%ymm2, %%ymm9 \n\t"
+        "vmulpd 96(%2,%0,8), %%ymm0, %%ymm4 \n\t"
+        "vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t"
+        "vmulpd 128(%2,%0,8), %%ymm0, %%ymm5 \n\t"
+        "vaddpd %%ymm11, %%ymm4, %%ymm11 \n\t"
+        "vmulpd 160(%2,%0,8), %%ymm0, %%ymm6 \n\t"
+        "vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t"
+        "vmulpd 192(%2,%0,8), %%ymm0, %%ymm7 \n\t"
+        "vmulpd 224(%2,%0,8), %%ymm0, %%ymm1 \n\t"  // ymm1 is reused; its first product was already added
+
+        "vaddpd %%ymm13, %%ymm6, %%ymm13 \n\t"
+        "vmovups %%ymm8 , (%3,%0,8) \n\t"  // store updated y
+        "vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t"
+        "vmovups %%ymm9 , 32(%3,%0,8) \n\t"
+        "vaddpd %%ymm15, %%ymm1, %%ymm15 \n\t"
+        "vmovups %%ymm10, 64(%3,%0,8) \n\t"
+        "vmovups %%ymm11, 96(%3,%0,8) \n\t"
+        "vmovups %%ymm12,128(%3,%0,8) \n\t"
+        "vmovups %%ymm13,160(%3,%0,8) \n\t"
+        "vmovups %%ymm14,192(%3,%0,8) \n\t"
+        "vmovups %%ymm15,224(%3,%0,8) \n\t"
+
+        "addq $32, %0 \n\t"
+        "subq $32, %1 \n\t"
+        "jnz 1b \n\t"
+        "vzeroupper \n\t"
+
+        :
+          "+r" (i),     // 0: read-write, advanced by addq inside the loop
+          "+r" (n)      // 1: read-write, counted down to zero by subq
+        :
+          "r" (x),      // 2
+          "r" (y),      // 3
+          "r" (alpha)   // 4
+        : "cc",
+          "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+          "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+          "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+          "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          "memory"
+        );
+
+}
+
+
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index ea83ddbaa..bb24d3cf5 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) #include "saxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "saxpy_microk_sandy-2.c" #endif @@ -71,7 +73,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(SANDYBRIDGE) + int n1 = n & -64; +#else int n1 = n & -32; +#endif if ( n1 ) saxpy_kernel_16(n1, x, y , &da ); diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c new file mode 100644 index 000000000..8a4392d37 --- /dev/null +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%4), %%ymm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" + "vmovups 32(%3,%0,4), %%ymm9 \n\t" + "vmovups 64(%3,%0,4), %%ymm10 \n\t" + "vmovups 96(%3,%0,4), %%ymm11 \n\t" + "vmovups 128(%3,%0,4), %%ymm12 \n\t" + "vmovups 160(%3,%0,4), %%ymm13 \n\t" + "vmovups 192(%3,%0,4), %%ymm14 \n\t" + "vmovups 224(%3,%0,4), %%ymm15 \n\t" + + "vmulps (%2,%0,4), %%ymm0, %%ymm1 \n\t" + "vmulps 32(%2,%0,4), %%ymm0, %%ymm2 \n\t" + "vaddps %%ymm8 , %%ymm1, %%ymm8 \n\t" + "vmulps 64(%2,%0,4), %%ymm0, %%ymm3 \n\t" + "vaddps %%ymm9 , %%ymm2, %%ymm9 \n\t" + "vmulps 96(%2,%0,4), %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm10, %%ymm3, %%ymm10 \n\t" + "vmulps 128(%2,%0,4), %%ymm0, %%ymm5 \n\t" + "vaddps %%ymm11, %%ymm4, %%ymm11 \n\t" + "vmulps 160(%2,%0,4), %%ymm0, %%ymm6 \n\t" + "vaddps %%ymm12, %%ymm5, %%ymm12 \n\t" + "vmulps 192(%2,%0,4), 
%%ymm0, %%ymm7 \n\t" + "vmulps 224(%2,%0,4), %%ymm0, %%ymm1 \n\t" + + "vaddps %%ymm13, %%ymm6, %%ymm13 \n\t" + "vmovups %%ymm8 , (%3,%0,4) \n\t" + "vaddps %%ymm14, %%ymm7, %%ymm14 \n\t" + "vmovups %%ymm9 , 32(%3,%0,4) \n\t" + "vaddps %%ymm15, %%ymm1, %%ymm15 \n\t" + "vmovups %%ymm10, 64(%3,%0,4) \n\t" + "vmovups %%ymm11, 96(%3,%0,4) \n\t" + "vmovups %%ymm12,128(%3,%0,4) \n\t" + "vmovups %%ymm13,160(%3,%0,4) \n\t" + "vmovups %%ymm14,192(%3,%0,4) \n\t" + "vmovups %%ymm15,224(%3,%0,4) \n\t" + + "addq $64, %0 \n\t" + "subq $64, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 60c6dec6e629a9d79c4ce97ce4662848ae856bea Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 6 Apr 2015 18:47:16 +0200 Subject: [PATCH 118/257] updated some lines for bulldozer --- kernel/x86_64/KERNEL.PILEDRIVER | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 55285e3d3..ec70253b8 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,7 @@ +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c @@ -7,6 +11,7 @@ ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S +SDOTKERNEL = sdot.c DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S From ba926e807c6b384ab92a560f9962e001bdfcec6e Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 7 Apr 2015 11:56:06 +0200 Subject: [PATCH 119/257] added cdot- and zdot benchmark --- benchmark/Makefile | 48 +++++++++- benchmark/zdot-intel.c | 196 +++++++++++++++++++++++++++++++++++++++++ benchmark/zdot.c | 195 
++++++++++++++++++++++++++++++++++++++++ 3 files changed, 435 insertions(+), 4 deletions(-) create mode 100644 benchmark/zdot-intel.c create mode 100644 benchmark/zdot.c diff --git a/benchmark/Makefile b/benchmark/Makefile index b5eaa9343..f76c56a26 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -10,7 +10,7 @@ include $(TOPDIR)/Makefile.system #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML 6.1 custom -ACML=/home/werner/project/acml6.1/gfortran64_mp/lib +ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm @@ -40,7 +40,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ - sdot.goto ddot.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ @@ -61,7 +61,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ - sdot.acml ddot.acml \ + sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ @@ -104,7 +104,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ - sdot.mkl ddot.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ @@ -998,6 +998,32 @@ ddot.atlas : ddot.$(SUFFIX) ddot.mkl : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) 
../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1264,6 +1290,20 @@ sdot.$(SUFFIX) : dot.c ddot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + saxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c new file mode 100644 index 000000000..bb2c40f38 --- /dev/null +++ b/benchmark/zdot-intel.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#define RETURN_BY_STACK 1 +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT _Complex result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if
(argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory 
allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT _Complex result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Wed, 8 Apr 2015 03:55:49 +0800 Subject: [PATCH 120/257] Refs #535. Fix the wrong vector instruction in sgemm sandy bridge kernel. 
--- kernel/x86_64/sgemm_kernel_16x4_sandy.S | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_sandy.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S index 77c7e2f50..ea15cd87e 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_sandy.S +++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S @@ -328,17 +328,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - vaddps (CO2), %xmm8,%xmm8 - vaddps 1 * SIZE(CO2), %xmm9,%xmm9 + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - vaddps (CO2, LDC), %xmm10,%xmm10 - vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 #endif @@ -389,10 +389,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 #endif @@ -557,11 +557,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 #endif @@ -597,8 +597,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(TRMMKERNEL) - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 #endif From 9299d8cfd608ee4ec7f07583d4e4cb1c67f359a6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 8 Apr 2015 16:29:55 +0200 Subject: [PATCH 121/257] added optimized cdot- and zdot-kernels for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 7 +- kernel/x86_64/KERNEL.PILEDRIVER | 3 + kernel/x86_64/KERNEL.STEAMROLLER | 5 +- kernel/x86_64/cdot.c | 174 +++++++++++++++++++++ kernel/x86_64/cdot_microk_bulldozer-2.c | 196 ++++++++++++++++++++++++ kernel/x86_64/zdot.c | 165 ++++++++++++++++++++ kernel/x86_64/zdot_microk_bulldozer-2.c | 115 ++++++++++++++ 7 files changed, 662 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/cdot.c create mode 100644 kernel/x86_64/cdot_microk_bulldozer-2.c create mode 100644 kernel/x86_64/zdot.c create mode 100644 kernel/x86_64/zdot_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 791c18146..ef1108646 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -3,7 +3,8 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -#DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c @@ -26,11 +27,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S - SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S @@ -40,6 +41,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = 
dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c @@ -49,6 +51,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S ZGEMMINCOPY = ZGEMMITCOPY = diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index ec70253b8..6eddebdad 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -12,6 +12,9 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S SDOTKERNEL = sdot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index f5b5cb942..8926010d3 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -3,7 +3,10 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c +DDOTKERNEL = ddot_bulldozer.S +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c new file mode 100644 index 000000000..1e9e3204b --- /dev/null +++ b/kernel/x86_64/cdot.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include <complex.h> + + +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "cdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "cdot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "cdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "cdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + dot[2] += x[j+2] * y[j+2] ; + dot[3] += x[j+3] * y[j+3] ; + dot[6] += x[j+2] * y[j+3] ; + dot[7] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[4] += x[j+4] * y[j+5] ; + dot[5] += x[j+5] * y[j+4] ; + + dot[2] += x[j+6] * y[j+6] ; + dot[3] += x[j+7] * y[j+7] ; + dot[6] += x[j+6] * y[j+7] ; + dot[7] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + d[4] = dot[4]; + d[5] = dot[5]; + d[6] = dot[6]; + d[7] = dot[7]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -16; + + if ( n1 ) + { + cdot_kernel_16(n1, x, y , dot ); + dot[0] += dot[2]; + dot[1] += dot[3]; + dot[4] += dot[6]; + dot[5] += dot[7]; + } + i = n1; + int j = i * 2; + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1]
+= x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[4] += x[ix] * y[iy+1] ; + dot[5] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[4] + dot[5]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[4] - dot[5]; + +#endif + + return(result); + +} + + diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c new file mode 100644 index 000000000..f587aa036 --- /dev/null +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n <=1024 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps 
%%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , 
%%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c new file mode 100644 index 000000000..e13858e06 --- /dev/null +++ b/kernel/x86_64/zdot.c @@ -0,0 +1,165 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include <complex.h> + + +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "zdot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "zdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + int j = i * 2; + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + +
dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c new file mode 100644 index 000000000..d45c4ad38 --- /dev/null +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + 
"vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 5c511639720d428904cf2c010f550fce090acc12 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 09:45:23 +0200 Subject: [PATCH 122/257] added optimized cdot- and zdot-kernel for steamroller --- kernel/x86_64/KERNEL.STEAMROLLER | 2 +- kernel/x86_64/cdot.c | 4 +- kernel/x86_64/cdot_microk_steamroller-2.c | 196 ++++++++++++++++++++++ kernel/x86_64/zdot.c | 4 +- kernel/x86_64/zdot_microk_steamroller-2.c | 193 +++++++++++++++++++++ 5 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/cdot_microk_steamroller-2.c create mode 100644 kernel/x86_64/zdot_microk_steamroller-2.c diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index 8926010d3..fbe04ca70 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -3,7 +3,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = 
zaxpy.c SDOTKERNEL = sdot.c -DDOTKERNEL = ddot_bullozer.S +DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 1e9e3204b..bfe707310 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "cdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "cdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "cdot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c new file mode 100644 index 000000000..76a3aa0eb --- /dev/null +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 1280 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, 
%%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * 
y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index e13858e06..e11b62ccd 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "zdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "zdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "zdot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c new file mode 100644 index 000000000..325f74ae3 --- /dev/null +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -0,0 +1,193 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + //"prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + // "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i 
+ "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" 
// x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From b57a60dac86703d82960e03e63065b435ea06f6c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 10:33:46 +0200 Subject: [PATCH 123/257] updated cdot and zdot for piledriver --- kernel/x86_64/cdot.c | 4 ++-- kernel/x86_64/zdot.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index bfe707310..e0ba31ae7 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -30,9 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <complex.h> -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "cdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "cdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index e11b62ccd..ee220c70e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -30,9 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <complex.h> -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "zdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "zdot_microk_nehalem-2.c" From fd838c75bc309b02c3a3abfdc3dc857e735f2a37 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 9 Apr 2015 15:13:52 +0200 Subject: [PATCH 124/257] add optimized cdot- and zdot-kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/cdot_microk_haswell-2.c | 119 ++++++++++++++++++++++++++ kernel/x86_64/zdot_microk_haswell-2.c | 119 ++++++++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 kernel/x86_64/cdot_microk_haswell-2.c create mode 100644 kernel/x86_64/zdot_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ea3d95872..a6e085d18 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -12,6 +12,9 @@ CGEMVTKERNEL = cgemv_t_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DAXPYKERNEL = daxpy.c SAXPYKERNEL = saxpy.c diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c new file mode 100644 index 000000000..52cedd556 --- /dev/null +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -0,0 +1,119 @@ 
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,4), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + + "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $32 , %0 \n\t" + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $16 , %1 \n\t" + "vfmadd231ps %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, 
%%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c new file mode 100644 index 000000000..3785713de --- /dev/null +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , 
%%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + From 33b332372ac7c21c1334d5f27013fce0144ac374 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 10 Apr 2015 09:37:26 +0200 Subject: [PATCH 125/257] add optimized cdot- and zdot-kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 + kernel/x86_64/cdot_microk_haswell-2.c | 2 +- kernel/x86_64/cdot_microk_sandy-2.c | 127 +++++++++++++++ kernel/x86_64/zdot_microk_haswell-2.c | 2 +- kernel/x86_64/zdot_microk_sandy-2.c | 222 ++++++++++++++++++++++++++ 5 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/cdot_microk_sandy-2.c create mode 100644 kernel/x86_64/zdot_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b0b6c6c84..a60f4a17a 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ 
b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -5,6 +5,9 @@ ZGEMVNKERNEL = zgemv_n_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index 52cedd556..fc76b138a 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -62,8 +62,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c new file mode 100644 index 000000000..22cd79e2e --- /dev/null +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddps %%ymm1 , 
%%ymm11, %%ymm1 \n\t" + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + + "vmovups 64(%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vaddps %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5 , %%ymm11, %%ymm5 \n\t" + + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddps %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "addq $32 , %0 \n\t" + "subq $16 , %1 \n\t" + "vaddps %%ymm6 , %%ymm10, %%ymm6 \n\t" + "vaddps %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 3785713de..04a6b971f 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -62,8 +62,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd 
$0xb1 , %%ymm13, %%ymm13 \n\t" "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c new file mode 100644 index 000000000..fd06612e6 --- /dev/null +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + +if ( n < 1280 ) +{ + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 
\n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 
2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "prefetcht0 576(%3,%0,8) \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "prefetcht0 576(%2,%0,8) \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + +} + + From 3119def9a7cdcbd1b030cd70054dc68a65ead41a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 10 
Apr 2015 11:10:31 +0200 Subject: [PATCH 126/257] updated cdot and zdot --- kernel/x86_64/cdot.c | 2 - kernel/x86_64/zdot.c | 2 - kernel/x86_64/zdot_microk_bulldozer-2.c | 85 ++++++++++++++++++++++- kernel/x86_64/zdot_microk_haswell-2.c | 91 +++++++++++++++++++++++++ 4 files changed, 174 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index e0ba31ae7..266ab4fb9 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,8 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "cdot_microk_steamroller-2.c" -#elif defined(NEHALEM) -#include "cdot_microk_nehalem-2.c" #elif defined(HASWELL) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ee220c70e..c0cca521b 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -34,8 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "zdot_microk_steamroller-2.c" -#elif defined(NEHALEM) -#include "zdot_microk_nehalem-2.c" #elif defined(HASWELL) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index d45c4ad38..30a9552d6 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -34,6 +34,9 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) BLASLONG register i = 0; + if ( n < 768 ) + { + __asm__ __volatile__ ( "vzeroupper \n\t" @@ -48,11 +51,9 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) ".align 16 \n\t" "1: \n\t" - "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x - "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y @@ -109,6 +110,86 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 384(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), 
%%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + } diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 04a6b971f..810cb4439 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -34,6 +34,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) BLASLONG register i = 0; + if ( n <=1280 ) + { + + __asm__ __volatile__ ( "vzeroupper \n\t" @@ -111,6 +115,93 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
"memory" ); + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "prefetcht0 576(%2,%0,8) \n\t" + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , 
%%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + From f9f127d838fba5b7f24e5911fbbcaa81331e8272 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 10 Apr 2015 16:18:03 +0200 Subject: [PATCH 127/257] added optimized ddot kernel for steamroller --- kernel/x86_64/ddot.c | 4 +- kernel/x86_64/ddot_microk_steamroller-2.c | 97 +++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/ddot_microk_steamroller-2.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index f857c2f35..06f018ce8 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,8 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "ddot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "ddot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c new file mode 100644 index 000000000..5ce20b5de --- /dev/null +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 166d76e864565dd69909f64553e88bcab6977907 Mon Sep 17 00:00:00 2001 
From: Werner Saar Date: Sat, 11 Apr 2015 08:48:18 +0200 Subject: [PATCH 128/257] added optimized sdot-kernel for steamroller --- kernel/x86_64/sdot.c | 4 +- kernel/x86_64/sdot_microk_steamroller-2.c | 163 ++++++++++++++++++++++ 2 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sdot_microk_steamroller-2.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index edce81b89..28dcd8601 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -28,8 +28,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #include "sdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c new file mode 100644 index 000000000..6b8b2566b --- /dev/null +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -0,0 +1,163 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + + if ( n < 4096 ) + { + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + + "vfmadd231ps (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y + "vfmadd231ps 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y + "vmovups 64(%2,%0,4), %%xmm0 \n\t" // 4 * x + "vmovups 80(%2,%0,4), %%xmm1 \n\t" // 4 * x + "vfmadd231ps 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y + "vfmadd231ps 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y + + "vmovups 96(%2,%0,4), %%xmm2 \n\t" // 4 * x + "vmovups 112(%2,%0,4), %%xmm3 \n\t" // 4 * x + + "vfmadd231ps 64(%3,%0,4), %%xmm0 , %%xmm4 \n\t" // 4 * y 
+ "vfmadd231ps 80(%3,%0,4), %%xmm1 , %%xmm5 \n\t" // 4 * y + "vfmadd231ps 96(%3,%0,4), %%xmm2 , %%xmm6 \n\t" // 4 * y + "vfmadd231ps 112(%3,%0,4), %%xmm3 , %%xmm7 \n\t" // 4 * y + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovss %%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + + "prefetcht0 512(%3,%0,4) \n\t" + "vfmadd231ps (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y + "vfmadd231ps 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y + "prefetcht0 576(%2,%0,4) \n\t" + "vmovups 64(%2,%0,4), %%xmm0 \n\t" // 4 * x + "vmovups 80(%2,%0,4), %%xmm1 \n\t" // 4 * x + "prefetcht0 576(%3,%0,4) \n\t" + "vfmadd231ps 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y + "vfmadd231ps 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y + + "vmovups 96(%2,%0,4), %%xmm2 \n\t" // 4 * x + "vmovups 112(%2,%0,4), %%xmm3 \n\t" // 4 * x + + "vfmadd231ps 64(%3,%0,4), %%xmm0 , %%xmm4 \n\t" // 4 * y + "vfmadd231ps 80(%3,%0,4), %%xmm1 , %%xmm5 \n\t" // 4 * y + "vfmadd231ps 96(%3,%0,4), %%xmm2 , %%xmm6 \n\t" // 4 * y + "vfmadd231ps 112(%3,%0,4), %%xmm3 , %%xmm7 \n\t" // 4 * y + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + + "vaddps 
%%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovss %%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From d35f6c63c2790b102d0e1fe30bc36d3de4b48aec Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 13 Apr 2015 12:22:43 +0200 Subject: [PATCH 129/257] add optimized daxpy-kernel for steamroller --- kernel/x86_64/daxpy.c | 4 +- kernel/x86_64/daxpy_microk_steamroller-2.c | 160 +++++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/daxpy_microk_steamroller-2.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index c07b5ca15..ea82bfb5c 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "daxpy_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "daxpy_microk_steamroller-2.c" #elif defined(HASWELL) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c new file mode 100644 index 000000000..e40009037 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -0,0 +1,160 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + if ( n < 2048 ) + { + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vmovddup 
(%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "prefetcht0 512(%2,%0,8) \n\t" + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "prefetcht0 576(%2,%0,8) \n\t" + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From d7a17ad85d7347102a79915d5291d8446d739270 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 13 Apr 2015 13:19:21 +0200 Subject: [PATCH 130/257] optimized sdot-kernel for pilediver --- kernel/x86_64/sdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 28dcd8601..c14659013 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "sdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" From 97984819797bb7385ecf90f1b0b553cf3a476bdb Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 13 Apr 2015 19:45:27 -0500 Subject: [PATCH 131/257] Refs #478, #482. Fix segfault bug for gemv_t with MAX_ALLOC_STACK flag. For gemv_t, directly use malloc to create the buffer. --- common.h | 2 ++ driver/others/memory.c | 10 ++++++++++ interface/gemv.c | 28 ++++++++++++++++++++-------- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/common.h b/common.h index fe2083469..cf25fd5b0 100644 --- a/common.h +++ b/common.h @@ -499,6 +499,8 @@ void blas_set_parameter(void); int blas_get_cpu_number(void); void *blas_memory_alloc (int); void blas_memory_free (void *); +void *blas_memory_alloc_nolock (int); //use malloc without blas_lock +void blas_memory_free_nolock (void *); int get_num_procs (void); diff --git a/driver/others/memory.c b/driver/others/memory.c index 4010ec974..12172fd80 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1161,6 +1161,16 @@ void blas_memory_free(void *free_area){ return; } +void *blas_memory_alloc_nolock(int unused) { + void *map_address; + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + return map_address; +} + +void blas_memory_free_nolock(void * map_address) { + free(map_address); +} + void blas_shutdown(void){ int pos; diff --git a/interface/gemv.c b/interface/gemv.c index f33973ef3..6b0aadca0 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -211,15 +211,24 @@ void 
CNAME(enum CBLAS_ORDER order, #ifdef MAX_STACK_ALLOC // make it volatile because some gemv implementation (ex: dgemv_n.S) // do not restore all register - volatile int stack_alloc_size = m + n; - if(stack_alloc_size < 128) + volatile int stack_alloc_size = 0; + if (trans == 0) { + stack_alloc_size = m + n; + if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer stack_alloc_size = 128; - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) stack_alloc_size = 0; - FLOAT stack_buffer[stack_alloc_size]; - buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc_nolock(1); + + }else{ + //for gemv_t, only malloc + buffer = (FLOAT *)blas_memory_alloc_nolock(1); + } #else + //Original OpenBLAS/GotoBLAS codes. buffer = (FLOAT *)blas_memory_alloc(1); #endif @@ -251,10 +260,13 @@ void CNAME(enum CBLAS_ORDER order, #endif #ifdef MAX_STACK_ALLOC - if(!stack_alloc_size) -#endif + if(!stack_alloc_size){ + blas_memory_free_nolock(buffer); + } +#else blas_memory_free(buffer); - +#endif + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; From fd9fd42936736a3fd111e958a2d495cdcac89a42 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 13 Apr 2015 23:22:27 -0500 Subject: [PATCH 132/257] Refs #478, #482. Fixed bug on previous commit. 
--- interface/gemv.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 6b0aadca0..12d27b13c 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -189,7 +189,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - + //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; @@ -213,6 +213,9 @@ void CNAME(enum CBLAS_ORDER order, // do not restore all register volatile int stack_alloc_size = 0; if (trans == 0) { + //for gemv_n, try to allocate on stack + //for gemv_t, use malloc + stack_alloc_size = m + n; if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer @@ -220,13 +223,11 @@ void CNAME(enum CBLAS_ORDER order, if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) stack_alloc_size = 0; - FLOAT stack_buffer[stack_alloc_size]; - buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc_nolock(1); - - }else{ - //for gemv_t, only malloc - buffer = (FLOAT *)blas_memory_alloc_nolock(1); } + + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc_nolock(1); + // printf("stack_alloc_size=%d\n", stack_alloc_size); #else //Original OpenBLAS/GotoBLAS codes. buffer = (FLOAT *)blas_memory_alloc(1); From 6c3a0b5d462ce1e48a22cc7f7cf125a6fdf9b6df Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 13 Apr 2015 23:23:40 -0500 Subject: [PATCH 133/257] Enable MAX_STACK_ALLOC by default. --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1479de660..19f3fe3d9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -162,7 +162,7 @@ COMMON_PROF = -pg # Improve GEMV and GER for small matrices by stack allocation. 
# For details, https://github.com/xianyi/OpenBLAS/pull/482 # -# MAX_STACK_ALLOC=2048 + MAX_STACK_ALLOC=2048 # Add a prefix or suffix to all exported symbol names in the shared library. # Avoid conflicts with other BLAS libraries, especially when using From 331c417637d6f64cd38c2d2cc9ff562746bca8e9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 14 Apr 2015 08:34:11 +0200 Subject: [PATCH 134/257] optimized saxpy for piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 1 + kernel/x86_64/saxpy.c | 2 + kernel/x86_64/saxpy_microk_piledriver-2.c | 159 ++++++++++++++++++++++ 3 files changed, 162 insertions(+) create mode 100644 kernel/x86_64/saxpy_microk_piledriver-2.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 6eddebdad..514c6bac2 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,4 @@ +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index bb24d3cf5..9cc908f07 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" +#elif defined(PILEDRIVER) +#include "saxpy_microk_piledriver-2.c" #endif diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c new file mode 100644 index 000000000..204cf8bac --- /dev/null +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + if ( n < 1024 ) + { + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%xmm8 \n\t" // 4 * y + "vmovups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y + "vmovups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y + "vmovups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y + "vmovups 64(%3,%0,4), %%xmm12 \n\t" // 4 * y + "vmovups 80(%3,%0,4), %%xmm13 \n\t" // 4 * y + "vmovups 96(%3,%0,4), %%xmm14 \n\t" // 4 * y + "vmovups 112(%3,%0,4), %%xmm15 \n\t" // 4 * y + + "vfmadd231ps (%2,%0,4), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231ps 16(%2,%0,4), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231ps 32(%2,%0,4), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231ps 48(%2,%0,4), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + "vfmadd231ps 64(%2,%0,4), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231ps 80(%2,%0,4), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231ps 96(%2,%0,4), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231ps 112(%2,%0,4), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,4) \n\t" + "vmovups %%xmm9 , 16(%3,%0,4) \n\t" + "vmovups %%xmm10, 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + 
); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%3,%0,4) \n\t" + "prefetcht0 576(%3,%0,4) \n\t" + + "vmovups (%3,%0,4), %%xmm8 \n\t" // 4 * y + "vmovups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y + "vmovups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y + "vmovups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y + "vmovups 64(%3,%0,4), %%xmm12 \n\t" // 4 * y + "vmovups 80(%3,%0,4), %%xmm13 \n\t" // 4 * y + "vmovups 96(%3,%0,4), %%xmm14 \n\t" // 4 * y + "vmovups 112(%3,%0,4), %%xmm15 \n\t" // 4 * y + + "prefetcht0 512(%2,%0,4) \n\t" + "prefetcht0 576(%2,%0,4) \n\t" + "vfmadd231ps (%2,%0,4), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231ps 16(%2,%0,4), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231ps 32(%2,%0,4), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231ps 48(%2,%0,4), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + "vfmadd231ps 64(%2,%0,4), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231ps 80(%2,%0,4), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231ps 96(%2,%0,4), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231ps 112(%2,%0,4), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,4) \n\t" + "vmovups %%xmm9 , 16(%3,%0,4) \n\t" + "vmovups %%xmm10, 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From f615dc76038842cd9078c3bdbabc1bc115bb857d Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 14 Apr 2015 09:09:39 +0200 Subject: [PATCH 135/257] added optimized saxpy kernel for 
steamroller --- kernel/x86_64/KERNEL.STEAMROLLER | 1 + kernel/x86_64/saxpy.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index fbe04ca70..dbdd1fe9b 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -1,3 +1,4 @@ +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 9cc908f07..0d2a2923c 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" -#elif defined(PILEDRIVER) +#elif defined(PILEDRIVER) || defined(STEAMROLLER) #include "saxpy_microk_piledriver-2.c" #endif From 34ba66606a036955ef7160dc75ef9e0fb450220f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 14 Apr 2015 14:23:29 +0200 Subject: [PATCH 136/257] add optimized daxpy-kernel for piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/daxpy.c | 4 +- kernel/x86_64/daxpy_microk_piledriver-2.c | 160 ++++++++++++++++++++++ 3 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/daxpy_microk_piledriver-2.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 514c6bac2..be8b629d9 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -13,10 +13,10 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c -DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index ea82bfb5c..65955f33d 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,10 +31,12 @@ USE OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) +#elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" #elif defined(STEAMROLLER) #include "daxpy_microk_steamroller-2.c" +#elif defined(PILEDRIVER) +#include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c new file mode 100644 index 000000000..95eb953b4 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -0,0 +1,160 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + 
"vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "prefetcht0 512(%2,%0,8) \n\t" + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "prefetcht0 576(%2,%0,8) \n\t" + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + 
: + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From baa0363ea247319959eb3d74229ca6d1e3463094 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 14 Apr 2015 15:09:13 +0200 Subject: [PATCH 137/257] add optimized ddot-kernel for piledriver --- kernel/x86_64/ddot.c | 6 +- kernel/x86_64/ddot_microk_piledriver-2.c | 165 +++++++++++++++++++++++ 2 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/ddot_microk_piledriver-2.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 06f018ce8..c17741cc3 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,10 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) #include "ddot_microk_steamroller-2.c" +#elif defined(PILEDRIVER) +#include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c new file mode 100644 index 000000000..ac950885c --- /dev/null +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -0,0 +1,165 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 1408 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper 
\n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 768(%2,%0,8) \n\t" + "prefetcht0 832(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "prefetcht0 768(%3,%0,8) \n\t" + "prefetcht0 832(%3,%0,8) \n\t" + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + From a4c96eca679664a70c34b40026bbc5ad55825fd2 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Wed, 15 Apr 2015 09:41:45 +0200 Subject: [PATCH 138/257] Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t Refs #478, #482, 9798481, fd9fd42 --- kernel/x86_64/dgemv_t_4.c | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 5d85ecab7..7c550a759 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); xbuffer = buffer; - ytemp = buffer + NBMAX; + ytemp = buffer + (m < NBMAX ? m : NBMAX); n0 = n / NBMAX; n1 = (n % NBMAX) >> 2 ; From f5d847122a72631cd622bcaefd1c1e85dcc3d0d6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 15 Apr 2015 11:59:38 +0200 Subject: [PATCH 139/257] updated caxpy_microk_bulldozer-2.c and caxpy.c --- kernel/x86_64/caxpy.c | 4 +- kernel/x86_64/caxpy_microk_bulldozer-2.c | 173 ++++++++++++++++------- 2 files changed, 121 insertions(+), 56 deletions(-) diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index be945a441..29342f46f 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -78,13 +78,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + int n1 = n & -16; if ( n1 ) { da[0] = da_r; da[1] = da_i; - caxpy_kernel_8(n1, x, y , &da ); + caxpy_kernel_8(n1, x, y , da ); ix = 2 * n1; } i = n1; diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 63575c374..33bda0943 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -31,89 +31,154 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif BLASLONG register i = 0; + if ( n < 640 ) + { + __asm__ __volatile__ ( + "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if 
!defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif ".align 16 \n\t" "1: \n\t" - "prefetcht0 768(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x - "prefetcht0 768(%3,%0,4) \n\t" -#if !defined(CONJ) - "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" - "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" + "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x + "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x + "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x - "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t" - - "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t" - - "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t" - - "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" - "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t" - "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t" - "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t" -#else - - "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, 
%%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" + + "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmaddps 64(%3,%0,4), %%xmm0 , %%xmm12, %%xmm12 \n\t" + "vfmaddps 80(%3,%0,4), %%xmm0 , %%xmm13, %%xmm13 \n\t" + "vfmaddps 96(%3,%0,4), %%xmm0 , %%xmm14, %%xmm14 \n\t" + "vfmaddps 112(%3,%0,4), %%xmm0 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmaddps %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmaddps %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmaddps %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" - "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part - "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t" - "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" 
(x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } - "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part - "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t" - "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif - "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part - "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t" - "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + ".align 16 \n\t" + "1: \n\t" - "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t" - "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x - "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t" - "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t" - "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t" - "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t" + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + "prefetcht0 512(%3,%0,4) \n\t" + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" + 
".align 2 \n\t" + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" -#endif + "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" - "vmovups %%xmm12, (%3,%0,4) \n\t" - "vmovups %%xmm13, 16(%3,%0,4) \n\t" - "vmovups %%xmm14, 32(%3,%0,4) \n\t" - "vmovups %%xmm15, 48(%3,%0,4) \n\t" + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" "addq $16, %0 \n\t" - "subq $8 , %1 \n\t" + "subq $8, %1 \n\t" "jnz 1b \n\t" + "vzeroupper \n\t" : : @@ -121,15 +186,15 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (alpha) // 4 + "r" (alpha), // 4 + "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); -} +} From e9f33b4ca73ca58cbc21fdfc690bd7bc6a0993e9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 15 Apr 2015 13:49:23 +0200 Subject: [PATCH 140/257] added optimized caxpy-kernel for steamroller --- kernel/x86_64/caxpy.c | 4 +- kernel/x86_64/caxpy_microk_steamroller-2.c | 200 +++++++++++++++++++++ 2 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/caxpy_microk_steamroller-2.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 29342f46f..80d3a763c 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -29,7 +29,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(STEAMROLLER) +#include "caxpy_microk_steamroller-2.c" +#elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c new file mode 100644 index 000000000..87370b032 --- /dev/null +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n <= 2048 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + + "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x + "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x + "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x + + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps 
$0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" + "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" + "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213ps 64(%3,%0,4), %%xmm0 , %%xmm12 \n\t" + "vfmadd213ps 80(%3,%0,4), %%xmm0 , %%xmm13 \n\t" + "vfmadd213ps 96(%3,%0,4), %%xmm0 , %%xmm14 \n\t" + "vfmadd213ps 112(%3,%0,4), %%xmm0 , %%xmm15 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss 
(%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,4) \n\t" + "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" + "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" + "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + From 248c9340c3e7c3627e0813f02df9d9149a71f812 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 15 Apr 2015 15:16:31 +0200 Subject: [PATCH 141/257] added optimized caxpy-kernel for haswell --- 
kernel/x86_64/KERNEL.HASWELL | 4 +- kernel/x86_64/caxpy.c | 5 +- kernel/x86_64/caxpy_microk_haswell-2.c | 132 +++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/caxpy_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index a6e085d18..9cce7772f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -15,9 +15,9 @@ DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c - -DAXPYKERNEL = daxpy.c SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 80d3a763c..ce174c59d 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "caxpy_microk_haswell-2.c" #endif @@ -80,7 +82,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + int n1 = n & -32; if ( n1 ) { @@ -89,6 +91,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, caxpy_kernel_8(n1, x, y , da ); ix = 2 * n1; } + i = n1; while(i < n) { diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c new file mode 100644 index 000000000..7a9fc1b95 --- /dev/null +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulps (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x + + "vmovups 128(%2,%0,4), %%ymm12 \n\t" // 4 complex values from x + "vmovups 160(%2,%0,4), %%ymm13 \n\t" // 4 complex values from x + "vmovups 192(%2,%0,4), %%ymm14 \n\t" // 4 complex values from x + "vmovups 224(%2,%0,4), %%ymm15 \n\t" // 4 complex values from x + + "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t" + "vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t" + "vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t" + + "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm5 \n\t" + "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm7 \n\t" + "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm9 \n\t" + 
"vfmadd231ps %%ymm1 , %%ymm10, %%ymm11 \n\t" + + "vpermilps $0xb1 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213ps 128(%3,%0,4), %%ymm0 , %%ymm12 \n\t" + "vfmadd213ps 160(%3,%0,4), %%ymm0 , %%ymm13 \n\t" + "vfmadd213ps 192(%3,%0,4), %%ymm0 , %%ymm14 \n\t" + "vfmadd213ps 224(%3,%0,4), %%ymm0 , %%ymm15 \n\t" + + "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm12 \n\t" + "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm13 \n\t" + "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm14 \n\t" + "vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t" + + "vmovups %%ymm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,4) \n\t" + "vmovups %%ymm9 , 64(%3,%0,4) \n\t" + "vmovups %%ymm11, 96(%3,%0,4) \n\t" + + "vmovups %%ymm12,128(%3,%0,4) \n\t" + "vmovups %%ymm13,160(%3,%0,4) \n\t" + "vmovups %%ymm14,192(%3,%0,4) \n\t" + "vmovups %%ymm15,224(%3,%0,4) \n\t" + + "addq $64, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + From 13889515b3531a3b453cad8917305a84468eb7f9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 15 Apr 2015 16:29:25 +0200 Subject: [PATCH 142/257] added optimized caxpy-kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 1 + kernel/x86_64/caxpy.c | 2 + kernel/x86_64/caxpy_microk_sandy-2.c | 116 +++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 kernel/x86_64/caxpy_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index a60f4a17a..b783c9a90 100644 --- 
a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -11,6 +11,7 @@ ZDOTKERNEL = zdot.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index ce174c59d..455d9d2ce 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_bulldozer-2.c" #elif defined(HASWELL) #include "caxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "caxpy_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c new file mode 100644 index 000000000..dbfce208f --- /dev/null +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulps (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x + + "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulps %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulps %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulps %%ymm9 , 
%%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "vaddps (%3,%0,4), %%ymm5 , %%ymm5 \n\t" + "vaddps 32(%3,%0,4), %%ymm7 , %%ymm7 \n\t" + "vaddps 64(%3,%0,4), %%ymm9 , %%ymm9 \n\t" + "vaddps 96(%3,%0,4), %%ymm11, %%ymm11 \n\t" + + "vmulps %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulps %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulps %%ymm8 , %%ymm1 , %%ymm8 \n\t" + "vmulps %%ymm10, %%ymm1 , %%ymm10 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddps %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddps %%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,4) \n\t" + "vmovups %%ymm9 , 64(%3,%0,4) \n\t" + "vmovups %%ymm11, 96(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + From 6d0db0151f0ee4c2ac9561d4b1ece7db1d96fdef Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 16 Apr 2015 11:19:37 +0200 Subject: [PATCH 143/257] added optimized zaxpy-kernels --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/KERNEL.SANDYBRIDGE | 1 + kernel/x86_64/zaxpy.c | 12 +- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 173 ++++++++++++------ kernel/x86_64/zaxpy_microk_haswell-2.c | 132 ++++++++++++++ kernel/x86_64/zaxpy_microk_sandy-2.c | 198 ++++++++++++++++++++ kernel/x86_64/zaxpy_microk_steamroller-2.c | 200 +++++++++++++++++++++ lapack-netlib/TESTING/sep.in | 4 +- lapack-netlib/TESTING/zctest.in | 2 +- 9 files changed, 663 insertions(+), 60 deletions(-) create mode 100644 kernel/x86_64/zaxpy_microk_haswell-2.c create mode 100644 kernel/x86_64/zaxpy_microk_sandy-2.c create mode 100644 kernel/x86_64/zaxpy_microk_steamroller-2.c diff --git 
a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9cce7772f..36f0762fd 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -18,6 +18,7 @@ ZDOTKERNEL = zdot.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b783c9a90..9ae568159 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -12,6 +12,7 @@ ZDOTKERNEL = zdot.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 52a25c793..1aa95d2b9 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -29,8 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) #include "zaxpy_microk_bulldozer-2.c" +#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zaxpy_microk_steamroller-2.c" +#elif defined(HASWELL) +#include "zaxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zaxpy_microk_sandy-2.c" #endif @@ -78,13 +84,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; + int n1 = n & -16; if ( n1 ) { da[0] = da_r; da[1] = da_i; - zaxpy_kernel_4(n1, x, y , &da ); + zaxpy_kernel_4(n1, x, y , da ); ix = 2 * n1; } i = n1; diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index f9732cd4e..0e15761f7 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -31,89 +31,154 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif BLASLONG register i = 0; + if ( n < 384 ) + { + __asm__ __volatile__ ( + "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif ".align 16 \n\t" "1: \n\t" - "prefetcht0 768(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x + ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x - "prefetcht0 768(%3,%0,8) \n\t" -#if !defined(CONJ) - "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t" - "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part 
- "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t" + "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 1 complex values from x + "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 1 complex values from x + "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 1 complex values from x + "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 1 complex values from x - "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t" - - "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t" - - "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t" - - "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t" - "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t" - "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t" - "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t" -#else - - "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" + + "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" + + 
"vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmaddpd 64(%3,%0,8), %%xmm0 , %%xmm12, %%xmm12 \n\t" + "vfmaddpd 80(%3,%0,8), %%xmm0 , %%xmm13, %%xmm13 \n\t" + "vfmaddpd 96(%3,%0,8), %%xmm0 , %%xmm14, %%xmm14 \n\t" + "vfmaddpd 112(%3,%0,8), %%xmm0 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmaddpd %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmaddpd %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmaddpd %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" - "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part - "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t" - "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } - "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part - "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t" - "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), 
%%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part - "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t" - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + ".align 16 \n\t" + "1: \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x - "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t" - "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t" - "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t" - "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t" + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + "prefetcht0 512(%3,%0,8) \n\t" + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" -#endif + "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" - "vmovups %%xmm12, (%3,%0,8) \n\t" - "vmovups %%xmm13, 16(%3,%0,8) \n\t" - "vmovups %%xmm14, 32(%3,%0,8) \n\t" - "vmovups %%xmm15, 48(%3,%0,8) \n\t" + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + 
"vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" "addq $8 , %0 \n\t" - "subq $4 , %1 \n\t" + "subq $4, %1 \n\t" "jnz 1b \n\t" + "vzeroupper \n\t" : : @@ -121,15 +186,15 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (alpha) // 4 + "r" (alpha), // 4 + "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); -} +} diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c new file mode 100644 index 000000000..e7e559502 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x + + "vmovups 128(%2,%0,8), %%ymm12 \n\t" // 2 complex values from x + "vmovups 160(%2,%0,8), %%ymm13 \n\t" // 2 complex values from x + "vmovups 192(%2,%0,8), %%ymm14 \n\t" // 2 complex values from x + "vmovups 224(%2,%0,8), %%ymm15 \n\t" // 2 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8
\n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213pd (%3,%0,8), %%ymm0 , %%ymm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 32(%3,%0,8), %%ymm0 , %%ymm7 \n\t" + "vfmadd213pd 64(%3,%0,8), %%ymm0 , %%ymm9 \n\t" + "vfmadd213pd 96(%3,%0,8), %%ymm0 , %%ymm11 \n\t" + + "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm5 \n\t" + "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm7 \n\t" + "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm9 \n\t" + "vfmadd231pd %%ymm1 , %%ymm10, %%ymm11 \n\t" + + "vpermilpd $0x5 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213pd 128(%3,%0,8), %%ymm0 , %%ymm12 \n\t" + "vfmadd213pd 160(%3,%0,8), %%ymm0 , %%ymm13 \n\t" + "vfmadd213pd 192(%3,%0,8), %%ymm0 , %%ymm14 \n\t" + "vfmadd213pd 224(%3,%0,8), %%ymm0 , %%ymm15 \n\t" + + "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm12 \n\t" + "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm13 \n\t" + "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm14 \n\t" + "vfmadd231pd %%ymm1 , %%ymm10, %%ymm15 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "vmovups %%ymm12,128(%3,%0,8) \n\t" + "vmovups %%ymm13,160(%3,%0,8) \n\t" + "vmovups %%ymm14,192(%3,%0,8) \n\t" + "vmovups %%ymm15,224(%3,%0,8) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 (read-write: advanced by addq each iteration) + "+r" (n) // 1 (read-write: counted down by subq until zero) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c new
file mode 100644 index 000000000..8b0a7ed05 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n < 1280 ) + { + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" + "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" + "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" + "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" + + "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" + "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddpd
%%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 (read-write: advanced by addq each iteration) + "+r" (n) // 1 (read-write: counted down by subq until zero) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,8) \n\t" + "prefetcht0 576(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "prefetcht0 576(%3,%0,8) \n\t" + "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" + "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" + "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" + "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" + + "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulpd %%ymm8 ,
%%ymm1 , %%ymm8 \n\t" + "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 (read-write: advanced by addq each iteration) + "+r" (n) // 1 (read-write: counted down by subq until zero) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + + + +} + diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c new file mode 100644 index 000000000..728d09213 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex value from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex value from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex value from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex value from x + + "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 1 complex value from x + "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 1 complex value from x + "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 1 complex value from x + "vmovups
112(%2,%0,8), %%xmm15 \n\t" // 1 complex value from x + + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" + "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" + "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213pd 64(%3,%0,8), %%xmm0 , %%xmm12 \n\t" + "vfmadd213pd 80(%3,%0,8), %%xmm0 , %%xmm13 \n\t" + "vfmadd213pd 96(%3,%0,8), %%xmm0 , %%xmm14 \n\t" + "vfmadd213pd 112(%3,%0,8), %%xmm0 , %%xmm15 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 (read-write: advanced by addq each iteration) + "+r" (n) // 1 (read-write: counted down by subq until zero) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", +
"%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex value from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex value from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex value from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex value from x + + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,8) \n\t" + "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" + "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" + "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 (read-write: advanced by addq each iteration) + "+r" (n) // 1 (read-write: counted down by subq until zero) + : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + diff --git
a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index c71a754c7..6ef4329c9 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -1,11 +1,11 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines 8 Number of values of N -0 1 2 3 5 19 20 21 Values of N (dimension) +0 1 2 3 5 18 19 21 Values of N (dimension) 5 Number of values of NB 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) -170.0 Threshold value +300.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits diff --git a/lapack-netlib/TESTING/zctest.in b/lapack-netlib/TESTING/zctest.in index ef88cc0d9..48e88ec50 100644 --- a/lapack-netlib/TESTING/zctest.in +++ b/lapack-netlib/TESTING/zctest.in @@ -1,6 +1,6 @@ Data file for testing ZCGESV/ZCPOSV LAPACK routines 11 Number of values of M -0 1 2 13 17 45 78 91 101 120 132 Values of M (row dimension) +0 1 2 13 17 45 78 91 101 121 132 Values of M (row dimension) 4 Number of values of NRHS 1 2 15 16 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio From 8e05d291b5ba619dfbc3e7378a0c9887e426a545 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 18 Apr 2015 08:41:41 +0200 Subject: [PATCH 144/257] added scal benchmark --- benchmark/Makefile | 71 ++++++++++++++++ benchmark/scal.c | 202 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 273 insertions(+) create mode 100644 benchmark/scal.c diff --git a/benchmark/Makefile b/benchmark/Makefile index f76c56a26..ffa4f2852 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -42,6 +42,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sger.goto dger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto 
zhemm.goto \ @@ -63,6 +64,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sger.acml dger.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -84,6 +86,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sger.atlas dger.atlas \ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -106,6 +109,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sger.mkl dger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -1078,6 +1082,60 @@ zaxpy.atlas : zaxpy.$(SUFFIX) zaxpy.mkl : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm3m #################################################### @@ -1316,6 +1374,19 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + cgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/scal.c b/benchmark/scal.c new 
file mode 100644 index 000000000..4c2da4d30 --- /dev/null +++ b/benchmark/scal.c @@ -0,0 +1,202 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SCAL + +#ifdef COMPLEX +#ifdef DOUBLE +#define SCAL BLASFUNC(zscal) +#else +#define SCAL BLASFUNC(cscal) +#endif +#else +#ifdef DOUBLE +#define SCAL BLASFUNC(dscal) +#else +#define SCAL BLASFUNC(sscal) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = 
atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Sun, 19 Apr 2015 11:24:07 +0200 Subject: [PATCH 145/257] added asum benchmark --- benchmark/Makefile | 74 +++++++++++++++++ benchmark/asum.c | 196 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 benchmark/asum.c diff --git a/benchmark/Makefile b/benchmark/Makefile index ffa4f2852..8a322c4f2 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -43,6 +43,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ @@ -65,6 +66,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -87,6 +89,7 @@ 
atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -110,6 +113,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -1136,6 +1140,62 @@ zscal.atlas : zscal.$(SUFFIX) zscal.mkl : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum 
#################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + + ##################################### Cgemm3m #################################################### @@ -1386,6 +1446,20 @@ cscal.$(SUFFIX) : scal.c zscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + cgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/asum.c b/benchmark/asum.c new file mode 100644 index 000000000..beb6402f4 --- /dev/null +++ b/benchmark/asum.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef ASUM + +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; 
argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Mon, 20 Apr 2015 23:22:40 -0500 Subject: [PATCH 146/257] Refs #478,#482, Enable stack alloc for s/dgemv_t.(revert 9798491) --- interface/gemv.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 12d27b13c..405fad51f 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -212,21 +212,17 @@ void CNAME(enum CBLAS_ORDER order, // make it volatile because some gemv implementation (ex: dgemv_n.S) // do not restore all register volatile int stack_alloc_size = 0; - if (trans == 0) { - //for gemv_n, try to allocate on stack - //for gemv_t, use malloc + //for gemv_n and gemv_t, try to allocate on stack + stack_alloc_size = m + n; + if(stack_alloc_size < 128) + //dgemv_n.S require a 128 bytes buffer + stack_alloc_size = 128; - stack_alloc_size = m + n; - if(stack_alloc_size < 128) - //dgemv_n.S require a 128 bytes buffer - stack_alloc_size = 128; - - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) - stack_alloc_size = 0; - } + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + stack_alloc_size = 0; FLOAT stack_buffer[stack_alloc_size]; - buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc_nolock(1); + buffer = stack_alloc_size ? 
stack_buffer : (FLOAT *)blas_memory_alloc(1); // printf("stack_alloc_size=%d\n", stack_alloc_size); #else //Original OpenBLAS/GotoBLAS codes. @@ -262,7 +258,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef MAX_STACK_ALLOC if(!stack_alloc_size){ - blas_memory_free_nolock(buffer); + blas_memory_free(buffer); } #else blas_memory_free(buffer); From 3814bf60d302edddd61fe609d2170dade89b459e Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 22 Apr 2015 10:42:50 +0200 Subject: [PATCH 147/257] added optimized dsymv kernels for haswell --- Makefile | 4 +- common_arm.h | 7 +- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/dsymv_L.c | 2 + kernel/x86_64/dsymv_L_microk_haswell-2.c | 129 ++++++++++++++++++++++ kernel/x86_64/dsymv_U.c | 2 + kernel/x86_64/dsymv_U_microk_haswell-2.c | 131 +++++++++++++++++++++++ lapack-netlib/TESTING/EIG/Makefile | 2 +- lapack-netlib/TESTING/LIN/Makefile | 2 +- make.inc | 2 +- 10 files changed, 278 insertions(+), 6 deletions(-) create mode 100644 kernel/x86_64/dsymv_L_microk_haswell-2.c create mode 100644 kernel/x86_64/dsymv_U_microk_haswell-2.c diff --git a/Makefile b/Makefile index 3aaf092fc..f8e1345d5 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,8 @@ ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack endif +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) + SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib test ctest shared install @@ -231,7 +233,7 @@ ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> 
$(NETLIB_LAPACK_DIR)/make.inc diff --git a/common_arm.h b/common_arm.h index 3cf15848a..eb4234b05 100644 --- a/common_arm.h +++ b/common_arm.h @@ -71,8 +71,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef COMMON_ARM #define COMMON_ARM -#define MB -#define WMB +#define MB __asm__ __volatile__ ("dmb ish" : : : "memory") +#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") #define INLINE inline @@ -88,9 +88,12 @@ static void __inline blas_lock(volatile BLASULONG *address){ while (*address) {YIELDING;}; __asm__ __volatile__( + "1: \n\t" "ldrex r2, [%1] \n\t" "mov r2, #0 \n\t" "strex r3, r2, [%1] \n\t" + "cmp r3, #0 \n\t" + "bne 1b \n\t" "mov %0 , r3 \n\t" : "=r"(ret), "=r"(address) : "1"(address) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 36f0762fd..ec66aeb1f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -10,6 +10,9 @@ ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c +DSYMV_L_KERNEL = dsymv_L.c +DSYMV_U_KERNEL = dsymv_U.c + SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index f6157f791..d476ae9bf 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_L_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "dsymv_L_microk_haswell-2.c" #elif defined(NEHALEM) #include "dsymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c new file mode 100644 index 000000000..bc5ec6b87 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); +/* AVX/FMA kernel: for rows i = from .. to-1 (4 per pass), y[i] += temp1[j]*a[j][i] and temp2[j] += a[j][i]*x[i], for columns j = 0..3. */ +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + /* The loop only exits when the row index equals 'to' exactly, so (to - from) must be a positive multiple of 4. */ + /* 'from' is advanced inside the asm, therefore it is passed as a read-write ("+r") operand, not as an input. */ + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[2] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 4 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 4 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 4 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 4 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 4 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 4 * a + + "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + "addq $4 , %0 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128
$0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + "+r" (from) // 0 + : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index ecfaf5043..ca18174b4 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_U_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "dsymv_U_microk_haswell-2.c" #elif defined(NEHALEM) #include "dsymv_U_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c new file mode 100644 index 000000000..6ce384f93 --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); +/* AVX/FMA kernel: for rows i = 0 .. n-1 (4 per pass), y[i] += temp1[j]*aj[i] and temp2[j] += aj[i]*x[i], for columns j = 0..3. */ +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + /* 'n' is counted down by 4 until it reaches exactly zero, so it must be a positive multiple of 4. */ + BLASLONG register i = 0; + /* Both 'i' and 'n' are written by the asm, therefore they are read-write ("+r") operands, not inputs. */ + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[2] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 4 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 4 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 4 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 4 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 4 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 4 * a + + "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 ,
%%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 128d59ae4..ef6ae0ec1 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -169,4 +169,4 @@ cchkee.o: cchkee.f zchkee.o: zchkee.f $(FORTRAN) $(DRVOPTS) -c $< -o $@ -.f.o : ; $(FORTRAN) $(OPTS) -c $< -o $@ +.f.o : ; $(FORTRAN) $(DRVOPTS) -c $< -o $@ diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 2352da64c..44b05b794 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -338,4 +338,4 @@ zchkaa.o: zchkaa.f $(FORTRAN) $(DRVOPTS) -c $< -o $@ .f.o: - $(FORTRAN) $(OPTS) -c $< -o $@ + $(FORTRAN) $(DRVOPTS) -c $< -o $@ diff --git a/make.inc b/make.inc index 485cb7d48..1fc95b0c6
100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX -DRVOPTS = $(OPTS) +DRVOPTS = $(NOOPT) ARCHFLAGS= -ru #RANLIB = ranlib From 1bec9abb9ad581aab170645451e2ad45fc955dd9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 22 Apr 2015 12:09:43 +0200 Subject: [PATCH 148/257] added optimized dsymv kernels for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 + kernel/x86_64/dsymv_L.c | 2 + kernel/x86_64/dsymv_L_microk_sandy-2.c | 138 ++++++++++++++++++++++++ kernel/x86_64/dsymv_U.c | 2 + kernel/x86_64/dsymv_U_microk_sandy-2.c | 140 +++++++++++++++++++++++++ 5 files changed, 285 insertions(+) create mode 100644 kernel/x86_64/dsymv_L_microk_sandy-2.c create mode 100644 kernel/x86_64/dsymv_U_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 9ae568159..533088fa9 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -8,6 +8,9 @@ DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c +DSYMV_L_KERNEL = dsymv_L.c +DSYMV_U_KERNEL = dsymv_U.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index d476ae9bf..3f5e77e5f 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_L_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) #include "dsymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c new file mode 100644 index 000000000..c87084915 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t" + "vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t" + + "vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm13, %%ymm11 \n\t" + "vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t" + + "vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t" + "vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t" + + "vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $4 , %0 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + 
"vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index ca18174b4..9f5ae3015 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dsymv_U_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_U_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "dsymv_U_microk_sandy-2.c" #elif defined(NEHALEM) #include "dsymv_U_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c new file mode 100644 index 000000000..212d4cf7b --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t" + "vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t" + + "vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd 
%%ymm8, %%ymm13, %%ymm11 \n\t" + "vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t" + + "vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t" + "vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t" + + "vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 1c4b0eeae35279d2b5121e39985d2227c9359a07 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 23 Apr 2015 10:23:13 +0200 
Subject: [PATCH 149/257] added optimized ssymv kernels for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/ssymv_L.c | 2 + kernel/x86_64/ssymv_L_microk_haswell-2.c | 124 +++++++++++++++++++++ kernel/x86_64/ssymv_U.c | 2 + kernel/x86_64/ssymv_U_microk_haswell-2.c | 136 +++++++++++++++++++++++ 5 files changed, 266 insertions(+) create mode 100644 kernel/x86_64/ssymv_L_microk_haswell-2.c create mode 100644 kernel/x86_64/ssymv_U_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ec66aeb1f..6849b05d9 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -10,6 +10,8 @@ ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c +SSYMV_L_KERNEL = ssymv_L.c +SSYMV_U_KERNEL = ssymv_U.c DSYMV_L_KERNEL = dsymv_L.c DSYMV_U_KERNEL = dsymv_U.c diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index a2b716b58..5296c47fc 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ssymv_L_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c new file mode 100644 index 000000000..516524528 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] + "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] + "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] + "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a + + "vfmadd231ps %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "vmovups %%xmm9 , (%3,%0,4) \n\t" + + "addq $4 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" 
+ "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 0aadd3fd2..8a8e8cef8 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ssymv_U_microk_haswell-2.c" #endif #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c new file mode 100644 index 000000000..42f801c96 --- /dev/null +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorps %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorps %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorps %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%ymm15 \n\t" // 2 * a + + "vfmadd231ps %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + + "vmovups %%ymm9 , (%3,%0,4) \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , 
%%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddps %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddps %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddps %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddps %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From b4f2153dcd2eaa75f7dfd0f57470efb95857d62c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 23 Apr 2015 12:19:24 +0200 Subject: [PATCH 150/257] added optimized ssymv kernels for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 3 +- kernel/x86_64/ssymv_L.c | 2 + kernel/x86_64/ssymv_L_microk_sandy-2.c | 243 +++++++++++++++++++++++++ kernel/x86_64/ssymv_U.c | 2 + kernel/x86_64/ssymv_U_microk_sandy-2.c | 144 +++++++++++++++ 5 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/ssymv_L_microk_sandy-2.c create mode 100644 kernel/x86_64/ssymv_U_microk_sandy-2.c diff --git 
a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 533088fa9..055072cfd 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -8,10 +8,11 @@ DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c +SSYMV_L_KERNEL = ssymv_L.c +SSYMV_U_KERNEL = ssymv_U.c DSYMV_L_KERNEL = dsymv_L.c DSYMV_U_KERNEL = dsymv_U.c - SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 5296c47fc..0997f108d 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -34,6 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_nehalem-2.c" #elif defined(HASWELL) #include "ssymv_L_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ssymv_L_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c new file mode 100644 index 000000000..07293a964 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+/* For i in [from, to): y[i] += sum_j temp1[j] * a[j][i], and temp2[j] += sum_i a[j][i] * x[i] (j = 0..3). */
+static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+	/* Both loops stop only when from == to exactly, so (to - from) must be a non-zero multiple of 4. */
+	if ( ( to - from ) & 4 )	/* bit 2 of the length set: 128-bit path, 4 floats per iteration */
+	{
+	__asm__  __volatile__
+	(
+	"vzeroupper                              \n\t"
+	"vxorps   %%xmm0 , %%xmm0 , %%xmm0       \n\t"	// partial sums for temp2[0]
+	"vxorps   %%xmm1 , %%xmm1 , %%xmm1       \n\t"	// partial sums for temp2[1]
+	"vxorps   %%xmm2 , %%xmm2 , %%xmm2       \n\t"	// partial sums for temp2[2]
+	"vxorps   %%xmm3 , %%xmm3 , %%xmm3       \n\t"	// partial sums for temp2[3]
+	"vbroadcastss    (%8), %%xmm4            \n\t"	// temp1[0]
+	"vbroadcastss   4(%8), %%xmm5            \n\t"	// temp1[1]
+	"vbroadcastss   8(%8), %%xmm6            \n\t"	// temp1[2]
+	"vbroadcastss  12(%8), %%xmm7            \n\t"	// temp1[3]
+
+	".align 16                               \n\t"
+	"1:                                      \n\t"
+
+	"vmovups  (%3,%0,4), %%xmm9              \n\t"	// 4 * y
+	"vmovups  (%2,%0,4), %%xmm8              \n\t"	// 4 * x
+
+	"vmovups  (%4,%0,4), %%xmm12             \n\t"	// 4 * a[0]
+	"vmovups  (%5,%0,4), %%xmm13             \n\t"	// 4 * a[1]
+	"vmovups  (%6,%0,4), %%xmm14             \n\t"	// 4 * a[2]
+	"vmovups  (%7,%0,4), %%xmm15             \n\t"	// 4 * a[3]
+
+	"vmulps   %%xmm4, %%xmm12 , %%xmm10      \n\t"
+	"vmulps   %%xmm8, %%xmm12 , %%xmm11      \n\t"
+	"vaddps   %%xmm9, %%xmm10 , %%xmm9       \n\t"
+	"vaddps   %%xmm0, %%xmm11 , %%xmm0       \n\t"
+
+	"vmulps   %%xmm5, %%xmm13 , %%xmm10      \n\t"
+	"vmulps   %%xmm8, %%xmm13 , %%xmm11      \n\t"
+	"vaddps   %%xmm9, %%xmm10 , %%xmm9       \n\t"
+	"vaddps   %%xmm1, %%xmm11 , %%xmm1       \n\t"
+
+	"vmulps   %%xmm6, %%xmm14 , %%xmm10      \n\t"
+	"vmulps   %%xmm8, %%xmm14 , %%xmm11      \n\t"
+	"vaddps   %%xmm9, %%xmm10 , %%xmm9       \n\t"
+	"vaddps   %%xmm2, %%xmm11 , %%xmm2       \n\t"
+
+	"vmulps   %%xmm7, %%xmm15 , %%xmm10      \n\t"
+	"vmulps   %%xmm8, %%xmm15 , %%xmm11      \n\t"
+	"vaddps   %%xmm9, %%xmm10 , %%xmm9       \n\t"
+	"vaddps   %%xmm3, %%xmm11 , %%xmm3       \n\t"
+
+	"vmovups  %%xmm9 , (%3,%0,4)             \n\t"	// store updated y
+
+	"addq     $4 , %0                        \n\t"
+	"cmpq     %0 , %1                        \n\t"
+	"jnz      1b                             \n\t"
+
+	"vmovss     (%9), %%xmm4                 \n\t"	// current temp2[0..3]
+	"vmovss    4(%9), %%xmm5                 \n\t"
+	"vmovss    8(%9), %%xmm6                 \n\t"
+	"vmovss   12(%9), %%xmm7                 \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vaddss   %%xmm4, %%xmm0, %%xmm0         \n\t"
+	"vaddss   %%xmm5, %%xmm1, %%xmm1         \n\t"
+	"vaddss   %%xmm6, %%xmm2, %%xmm2         \n\t"
+	"vaddss   %%xmm7, %%xmm3, %%xmm3         \n\t"
+
+	"vmovss   %%xmm0 ,   (%9)                \n\t"	// save temp2
+	"vmovss   %%xmm1 ,  4(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm2 ,  8(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm3 , 12(%9)                \n\t"	// save temp2
+	"vzeroupper                              \n\t"
+
+	:
+	  "+r" (from)	// 0 (advanced by addq, so it must be a read-write operand)
+	:
+	  "r" (to),	// 1
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (a[0]),	// 4
+	  "r" (a[1]),	// 5
+	  "r" (a[2]),	// 6
+	  "r" (a[3]),	// 7
+	  "r" (temp1),	// 8
+	  "r" (temp2)	// 9
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+	return;
+	}
+
+	/* Otherwise (bit 2 of the length clear): 256-bit path, 8 floats per iteration. */
+	__asm__  __volatile__
+	(
+	"vzeroupper                              \n\t"
+	"vxorps   %%ymm0 , %%ymm0 , %%ymm0       \n\t"	// partial sums for temp2[0]
+	"vxorps   %%ymm1 , %%ymm1 , %%ymm1       \n\t"	// partial sums for temp2[1]
+	"vxorps   %%ymm2 , %%ymm2 , %%ymm2       \n\t"	// partial sums for temp2[2]
+	"vxorps   %%ymm3 , %%ymm3 , %%ymm3       \n\t"	// partial sums for temp2[3]
+	"vbroadcastss    (%8), %%ymm4            \n\t"	// temp1[0]
+	"vbroadcastss   4(%8), %%ymm5            \n\t"	// temp1[1]
+	"vbroadcastss   8(%8), %%ymm6            \n\t"	// temp1[2]
+	"vbroadcastss  12(%8), %%ymm7            \n\t"	// temp1[3]
+
+	".align 16                               \n\t"
+	"1:                                      \n\t"
+
+	"vmovups  (%3,%0,4), %%ymm9              \n\t"	// 8 * y
+	"vmovups  (%2,%0,4), %%ymm8              \n\t"	// 8 * x
+
+	"vmovups  (%4,%0,4), %%ymm12             \n\t"	// 8 * a[0]
+	"vmovups  (%5,%0,4), %%ymm13             \n\t"	// 8 * a[1]
+	"vmovups  (%6,%0,4), %%ymm14             \n\t"	// 8 * a[2]
+	"vmovups  (%7,%0,4), %%ymm15             \n\t"	// 8 * a[3]
+
+	"vmulps   %%ymm4, %%ymm12 , %%ymm10      \n\t"
+	"vmulps   %%ymm8, %%ymm12 , %%ymm11      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vaddps   %%ymm0, %%ymm11 , %%ymm0       \n\t"
+
+	"vmulps   %%ymm5, %%ymm13 , %%ymm10      \n\t"
+	"vmulps   %%ymm8, %%ymm13 , %%ymm11      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vaddps   %%ymm1, %%ymm11 , %%ymm1       \n\t"
+
+	"vmulps   %%ymm6, %%ymm14 , %%ymm10      \n\t"
+	"vmulps   %%ymm8, %%ymm14 , %%ymm11      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vaddps   %%ymm2, %%ymm11 , %%ymm2       \n\t"
+
+	"vmulps   %%ymm7, %%ymm15 , %%ymm10      \n\t"
+	"vmulps   %%ymm8, %%ymm15 , %%ymm11      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vaddps   %%ymm3, %%ymm11 , %%ymm3       \n\t"
+
+	"vmovups  %%ymm9 , (%3,%0,4)             \n\t"	// store updated y
+
+	"addq     $8 , %0                        \n\t"
+	"cmpq     %0 , %1                        \n\t"
+	"jnz      1b                             \n\t"
+
+	"vmovss     (%9), %%xmm4                 \n\t"	// current temp2[0..3]
+	"vmovss    4(%9), %%xmm5                 \n\t"
+	"vmovss    8(%9), %%xmm6                 \n\t"
+	"vmovss   12(%9), %%xmm7                 \n\t"
+
+	"vextractf128 $0x01, %%ymm0 , %%xmm12    \n\t"	// fold the upper 128 bits into the lower
+	"vextractf128 $0x01, %%ymm1 , %%xmm13    \n\t"
+	"vextractf128 $0x01, %%ymm2 , %%xmm14    \n\t"
+	"vextractf128 $0x01, %%ymm3 , %%xmm15    \n\t"
+
+	"vaddps   %%xmm0, %%xmm12, %%xmm0        \n\t"
+	"vaddps   %%xmm1, %%xmm13, %%xmm1        \n\t"
+	"vaddps   %%xmm2, %%xmm14, %%xmm2        \n\t"
+	"vaddps   %%xmm3, %%xmm15, %%xmm3        \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vaddss   %%xmm4, %%xmm0, %%xmm0         \n\t"
+	"vaddss   %%xmm5, %%xmm1, %%xmm1         \n\t"
+	"vaddss   %%xmm6, %%xmm2, %%xmm2         \n\t"
+	"vaddss   %%xmm7, %%xmm3, %%xmm3         \n\t"
+
+	"vmovss   %%xmm0 ,   (%9)                \n\t"	// save temp2
+	"vmovss   %%xmm1 ,  4(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm2 ,  8(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm3 , 12(%9)                \n\t"	// save temp2
+	"vzeroupper                              \n\t"
+
+	:
+	  "+r" (from)	// 0 (advanced by addq, so it must be a read-write operand)
+	:
+	  "r" (to),	// 1
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (a[0]),	// 4
+	  "r" (a[1]),	// 5
+	  "r" (a[2]),	// 6
+	  "r" (a[3]),	// 7
+	  "r" (temp1),	// 8
+	  "r" (temp2)	// 9
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+
+
+}
+
+
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 8a8e8cef8..ed1e8236c 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_nehalem-2.c" #elif defined(HASWELL) #include "ssymv_U_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ssymv_U_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c new file mode 100644 index 000000000..4b699af50 --- /dev/null +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+/* For i in [0, n): y[i] += sum_j temp1[j] * aj[i], and temp2[j] += sum_i aj[i] * x[i] (j = 0..3). */
+static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+	/* The loop counts n down by 8 and exits on zero, so n must be a positive multiple of 8. */
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"vzeroupper                              \n\t"
+	"vxorps   %%ymm0 , %%ymm0 , %%ymm0       \n\t"	// partial sums for temp2[0]
+	"vxorps   %%ymm1 , %%ymm1 , %%ymm1       \n\t"	// partial sums for temp2[1]
+	"vxorps   %%ymm2 , %%ymm2 , %%ymm2       \n\t"	// partial sums for temp2[2]
+	"vxorps   %%ymm3 , %%ymm3 , %%ymm3       \n\t"	// partial sums for temp2[3]
+	"vbroadcastss    (%8), %%ymm4            \n\t"	// temp1[0]
+	"vbroadcastss   4(%8), %%ymm5            \n\t"	// temp1[1]
+	"vbroadcastss   8(%8), %%ymm6            \n\t"	// temp1[2]
+	"vbroadcastss  12(%8), %%ymm7            \n\t"	// temp1[3]
+	"xorq     %0,%0                          \n\t"
+
+	".align 16                               \n\t"
+	"1:                                      \n\t"
+
+	"vmovups  (%3,%0,4), %%ymm9              \n\t"	// 8 * y
+	"vmovups  (%2,%0,4), %%ymm8              \n\t"	// 8 * x
+
+	"vmovups  (%4,%0,4), %%ymm12             \n\t"	// 8 * a0
+	"vmovups  (%5,%0,4), %%ymm13             \n\t"	// 8 * a1
+	"vmovups  (%6,%0,4), %%ymm14             \n\t"	// 8 * a2
+	"vmovups  (%7,%0,4), %%ymm15             \n\t"	// 8 * a3
+
+	"vmulps   %%ymm4, %%ymm12 , %%ymm10      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vmulps   %%ymm8, %%ymm12 , %%ymm11      \n\t"
+	"vaddps   %%ymm0, %%ymm11 , %%ymm0       \n\t"
+
+	"vmulps   %%ymm5, %%ymm13 , %%ymm10      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vmulps   %%ymm8, %%ymm13 , %%ymm11      \n\t"
+	"vaddps   %%ymm1, %%ymm11 , %%ymm1       \n\t"
+
+	"vmulps   %%ymm6, %%ymm14 , %%ymm10      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vmulps   %%ymm8, %%ymm14 , %%ymm11      \n\t"
+	"vaddps   %%ymm2, %%ymm11 , %%ymm2       \n\t"
+
+	"vmulps   %%ymm7, %%ymm15 , %%ymm10      \n\t"
+	"vaddps   %%ymm9, %%ymm10 , %%ymm9       \n\t"
+	"vmulps   %%ymm8, %%ymm15 , %%ymm11      \n\t"
+	"vaddps   %%ymm3, %%ymm11 , %%ymm3       \n\t"
+
+	"vmovups  %%ymm9 , (%3,%0,4)             \n\t"	// store updated y
+	"addq     $8 , %0                        \n\t"
+	"subq     $8 , %1                        \n\t"
+
+	"jnz      1b                             \n\t"
+
+	"vmovss     (%9), %%xmm4                 \n\t"	// current temp2[0..3]
+	"vmovss    4(%9), %%xmm5                 \n\t"
+	"vmovss    8(%9), %%xmm6                 \n\t"
+	"vmovss   12(%9), %%xmm7                 \n\t"
+
+	"vextractf128 $0x01, %%ymm0 , %%xmm12    \n\t"	// fold the upper 128 bits into the lower
+	"vextractf128 $0x01, %%ymm1 , %%xmm13    \n\t"
+	"vextractf128 $0x01, %%ymm2 , %%xmm14    \n\t"
+	"vextractf128 $0x01, %%ymm3 , %%xmm15    \n\t"
+
+	"vaddps   %%xmm0, %%xmm12, %%xmm0        \n\t"
+	"vaddps   %%xmm1, %%xmm13, %%xmm1        \n\t"
+	"vaddps   %%xmm2, %%xmm14, %%xmm2        \n\t"
+	"vaddps   %%xmm3, %%xmm15, %%xmm3        \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vhaddps  %%xmm0, %%xmm0, %%xmm0         \n\t"
+	"vhaddps  %%xmm1, %%xmm1, %%xmm1         \n\t"
+	"vhaddps  %%xmm2, %%xmm2, %%xmm2         \n\t"
+	"vhaddps  %%xmm3, %%xmm3, %%xmm3         \n\t"
+
+	"vaddss   %%xmm4, %%xmm0, %%xmm0         \n\t"	// single-precision scalar add (was vaddsd)
+	"vaddss   %%xmm5, %%xmm1, %%xmm1         \n\t"
+	"vaddss   %%xmm6, %%xmm2, %%xmm2         \n\t"
+	"vaddss   %%xmm7, %%xmm3, %%xmm3         \n\t"
+
+	"vmovss   %%xmm0 ,   (%9)                \n\t"	// save temp2
+	"vmovss   %%xmm1 ,  4(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm2 ,  8(%9)                \n\t"	// save temp2
+	"vmovss   %%xmm3 , 12(%9)                \n\t"	// save temp2
+	"vzeroupper                              \n\t"
+
+	:
+	  "+r" (i),	// 0 (zeroed and advanced inside the asm)
+	  "+r" (n)	// 1 (counted down inside the asm)
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (a0),	// 4
+	  "r" (a1),	// 5
+	  "r" (a2),	// 6
+	  "r" (a3),	// 7
+	  "r" (temp1),	// 8
+	  "r" (temp2)	// 9
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
From ab567d84434a2962a2cf4d8803fb9c3eff8b3b96 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Tue, 21 Apr 2015 10:12:01 +0200 Subject: [PATCH 151/257] gemv: Ensure stack buffer is large enough to handle memory alignment Ref #478 --- interface/gemv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index 405fad51f..d298d79f6 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -38,6 +38,7 @@ #include #include "common.h" 
+#include "l1param.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif @@ -214,6 +215,9 @@ void CNAME(enum CBLAS_ORDER order, volatile int stack_alloc_size = 0; //for gemv_n and gemv_t, try to allocate on stack stack_alloc_size = m + n; +#ifdef ALIGNED_ACCESS + stack_alloc_size += 3; +#endif if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer stack_alloc_size = 128; From 3a67daa95496f549aa23ab0991ef12353b8be50f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 24 Apr 2015 10:56:55 +0200 Subject: [PATCH 152/257] optimized ddot.c for increments != 1 --- kernel/x86_64/ddot.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index c17741cc3..0f77d5fbc 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -101,15 +101,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + while(i < n) { - dot += y[iy] * x[ix] ; + temp1 += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } + dot = temp1 + temp2; return(dot); } From 0273966abb890c1a9ec8bcf06d8b07d60be564fc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 24 Apr 2015 11:39:17 +0200 Subject: [PATCH 153/257] optimized daxpy kernel for increments != 1 --- kernel/x86_64/daxpy.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 65955f33d..10cc573db 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -101,6 +101,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } + BLASLONG n1 = n & -4; + + while(i < n1) + { + 
+ FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + while(i < n) { From dee100d0e4fe60d1d7933d4b8564d3c22669ed16 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 24 Apr 2015 11:52:59 +0200 Subject: [PATCH 154/257] optimized saxpy.c for increments != 1 --- kernel/x86_64/saxpy.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 0d2a2923c..b37e24d9b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -76,9 +76,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { #if defined(SANDYBRIDGE) - int n1 = n & -64; + BLASLONG n1 = n & -64; #else - int n1 = n & -32; + BLASLONG n1 = n & -32; #endif if ( n1 ) @@ -97,6 +97,29 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + + } + + while(i < n) { From c22068c4060dba66dbdfeda28b57c0ac0fff5f82 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 24 Apr 2015 13:13:20 +0200 Subject: [PATCH 155/257] optimized sdot.c for increments != 1 --- kernel/x86_64/sdot.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c14659013..a6da1fea7 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -80,7 +80,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -32; + BLASLONG 
n1 = n & -32; if ( n1 ) sdot_kernel_16(n1, x, y , &dot ); @@ -99,6 +99,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + while(i < n) { From fc0e0391f3f8430aa6b70509c34f078eb1438a2c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 24 Apr 2015 14:30:44 +0200 Subject: [PATCH 156/257] bugfixes: replaced int with BLASLONG --- kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 6 ++++-- kernel/x86_64/daxpy.c | 4 ++-- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 5 +++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 455d9d2ce..1ee0499a7 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -84,7 +84,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -32; + BLASLONG n1 = n & -32; if ( n1 ) { diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 266ab4fb9..2b2c4ff7a 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -109,7 +109,7 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + BLASLONG n1 = n & -16; if ( n1 ) { @@ -119,8 +119,10 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in dot[4] += dot[6]; dot[5] += dot[7]; } + i = n1; - int j = i * 2; + BLASLONG j = i * 2; + while( i < n ) { diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 10cc573db..9207e209f 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -80,9 +80,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { #if defined(SANDYBRIDGE) - int n1 = n & -32; + BLASLONG n1 = n & -32; #else - int n1 = n & -16; + BLASLONG n1 = 
n & -16; #endif if ( n1 ) diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0f77d5fbc..4bf8082c9 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -83,7 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + BLASLONG n1 = n & -16; if ( n1 ) ddot_kernel_8(n1, x, y , &dot ); diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 1aa95d2b9..560acc7f9 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -84,7 +84,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -16; + BLASLONG n1 = n & -16; if ( n1 ) { diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index c0cca521b..eee00fd9f 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -105,13 +105,14 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + BLASLONG n1 = n & -8; if ( n1 ) zdot_kernel_8(n1, x, y , dot ); i = n1; - int j = i * 2; + BLASLONG j = i * 2; + while( i < n ) { From e77db2af31c54e87ff80009b2fba1560643f6213 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 25 Apr 2015 14:53:07 +0200 Subject: [PATCH 157/257] add benchmarks for zgeru and cgeru --- benchmark/Makefile | 41 +++++++++++++++++++++++++++++++++++++---- benchmark/ger.c | 9 ++++++++- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 8a322c4f2..1d2e9ff66 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -39,7 +39,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto \ + sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto 
daxpy.goto caxpy.goto zaxpy.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ @@ -62,7 +62,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml \ + sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ @@ -85,7 +85,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ @@ -109,7 +109,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ @@ -736,6 +736,32 @@ dger.atlas : dger.$(SUFFIX) dger.mkl : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymv #################################################### ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1348,6 +1374,13 @@ sger.$(SUFFIX) : ger.c dger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + ssymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/ger.c b/benchmark/ger.c index 354281006..a752a3c3e 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -35,12 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef GER - +#ifdef COMPLEX +#ifdef DOUBLE +#define GER BLASFUNC(zgeru) +#else +#define GER BLASFUNC(cgeru) +#endif +#else #ifdef DOUBLE #define GER BLASFUNC(dger) #else #define GER BLASFUNC(sger) #endif +#endif #if defined(__WIN32__) || defined(__WIN64__) From e216f686cb9c5fa3a6160af753cfa46e71ea5085 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 28 Apr 2015 10:18:32 +0200 Subject: [PATCH 158/257] optimized saxpy and daxpy for sandybridge --- kernel/x86_64/daxpy.c | 4 -- kernel/x86_64/daxpy_microk_sandy-2.c | 87 ++++++++++++++++----------- kernel/x86_64/saxpy.c | 4 -- kernel/x86_64/saxpy_microk_sandy-2.c | 89 +++++++++++++++++----------- 4 files changed, 107 insertions(+), 77 deletions(-) diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 9207e209f..56d323cbe 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -79,11 +79,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(SANDYBRIDGE) - BLASLONG n1 = n & -32; -#else BLASLONG n1 = n & -16; -#endif if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 963ad322d..522e084dc 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -37,48 +37,67 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __asm__ __volatile__ ( "vbroadcastsd (%4), %%ymm0 \n\t" // alpha + "vmovups (%3,%0,8), %%ymm8 \n\t" + "vmovups 32(%3,%0,8), %%ymm9 \n\t" + "vmovups 64(%3,%0,8), %%ymm10 \n\t" + "vmovups 96(%3,%0,8), %%ymm11 \n\t" + "vmovups (%2,%0,8), %%ymm4 \n\t" + "vmovups 32(%2,%0,8), %%ymm5 \n\t" + "vmovups 64(%2,%0,8), %%ymm6 \n\t" + "vmovups 96(%2,%0,8), %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" + + "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vmulpd %%ymm5, %%ymm0, %%ymm5 \n\t" + 
"vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" + "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" + "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" + "vmovups (%3,%0,8), %%ymm8 \n\t" "vmovups 32(%3,%0,8), %%ymm9 \n\t" "vmovups 64(%3,%0,8), %%ymm10 \n\t" "vmovups 96(%3,%0,8), %%ymm11 \n\t" - "vmovups 128(%3,%0,8), %%ymm12 \n\t" - "vmovups 160(%3,%0,8), %%ymm13 \n\t" - "vmovups 192(%3,%0,8), %%ymm14 \n\t" - "vmovups 224(%3,%0,8), %%ymm15 \n\t" - - "vmulpd (%2,%0,8), %%ymm0, %%ymm1 \n\t" - "vmulpd 32(%2,%0,8), %%ymm0, %%ymm2 \n\t" - "vaddpd %%ymm8 , %%ymm1, %%ymm8 \n\t" - "vmulpd 64(%2,%0,8), %%ymm0, %%ymm3 \n\t" - "vaddpd %%ymm9 , %%ymm2, %%ymm9 \n\t" - "vmulpd 96(%2,%0,8), %%ymm0, %%ymm4 \n\t" - "vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t" - "vmulpd 128(%2,%0,8), %%ymm0, %%ymm5 \n\t" - "vaddpd %%ymm11, %%ymm4, %%ymm11 \n\t" - "vmulpd 160(%2,%0,8), %%ymm0, %%ymm6 \n\t" - "vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t" - "vmulpd 192(%2,%0,8), %%ymm0, %%ymm7 \n\t" - "vmulpd 224(%2,%0,8), %%ymm0, %%ymm1 \n\t" - - "vaddpd %%ymm13, %%ymm6, %%ymm13 \n\t" - "vmovups %%ymm8 , (%3,%0,8) \n\t" - "vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t" - "vmovups %%ymm9 , 32(%3,%0,8) \n\t" - "vaddpd %%ymm15, %%ymm1, %%ymm15 \n\t" - "vmovups %%ymm10, 64(%3,%0,8) \n\t" - "vmovups %%ymm11, 96(%3,%0,8) \n\t" - "vmovups %%ymm12,128(%3,%0,8) \n\t" - "vmovups %%ymm13,160(%3,%0,8) \n\t" - "vmovups %%ymm14,192(%3,%0,8) \n\t" - "vmovups %%ymm15,224(%3,%0,8) \n\t" - - "addq $32, %0 \n\t" - "subq $32, %1 \n\t" + + "vmovups (%2,%0,8), %%ymm4 \n\t" + "vmovups 32(%2,%0,8), %%ymm5 \n\t" + "vmovups 64(%2,%0,8), %%ymm6 \n\t" + "vmovups 96(%2,%0,8), %%ymm7 \n\t" + + "vmovups %%ymm12, -128(%3,%0,8) \n\t" + "vmovups %%ymm13, -96(%3,%0,8) \n\t" + "vmovups %%ymm14, -64(%3,%0,8) \n\t" + "vmovups %%ymm15, -32(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" "jnz 1b \n\t" - "vzeroupper \n\t" + + "2: \n\t" + "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vmulpd %%ymm5, %%ymm0, 
%%ymm5 \n\t" + "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" + "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" + + "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" + + "vmovups %%ymm12, -128(%3,%0,8) \n\t" + "vmovups %%ymm13, -96(%3,%0,8) \n\t" + "vmovups %%ymm14, -64(%3,%0,8) \n\t" + "vmovups %%ymm15, -32(%3,%0,8) \n\t" + + "vzeroupper \n\t" : : diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index b37e24d9b..0b76c42f7 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -75,11 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(SANDYBRIDGE) - BLASLONG n1 = n & -64; -#else BLASLONG n1 = n & -32; -#endif if ( n1 ) saxpy_kernel_16(n1, x, y , &da ); diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 8a4392d37..159a23175 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -37,48 +37,67 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __asm__ __volatile__ ( "vbroadcastss (%4), %%ymm0 \n\t" // alpha + "vmovups (%3,%0,4), %%ymm8 \n\t" + "vmovups 32(%3,%0,4), %%ymm9 \n\t" + "vmovups 64(%3,%0,4), %%ymm10 \n\t" + "vmovups 96(%3,%0,4), %%ymm11 \n\t" + "vmovups (%2,%0,4), %%ymm4 \n\t" + "vmovups 32(%2,%0,4), %%ymm5 \n\t" + "vmovups 64(%2,%0,4), %%ymm6 \n\t" + "vmovups 96(%2,%0,4), %%ymm7 \n\t" + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" + "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" + + "vmulps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vmulps %%ymm5, %%ymm0, %%ymm5 \n\t" + "vaddps %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vmulps %%ymm6, %%ymm0, %%ymm6 \n\t" + "vaddps %%ymm10, %%ymm6, %%ymm14 \n\t" + "vmulps %%ymm7, %%ymm0, %%ymm7 \n\t" + "vaddps %%ymm11, %%ymm7, %%ymm15 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" "vmovups 32(%3,%0,4), %%ymm9 
\n\t" "vmovups 64(%3,%0,4), %%ymm10 \n\t" "vmovups 96(%3,%0,4), %%ymm11 \n\t" - "vmovups 128(%3,%0,4), %%ymm12 \n\t" - "vmovups 160(%3,%0,4), %%ymm13 \n\t" - "vmovups 192(%3,%0,4), %%ymm14 \n\t" - "vmovups 224(%3,%0,4), %%ymm15 \n\t" - - "vmulps (%2,%0,4), %%ymm0, %%ymm1 \n\t" - "vmulps 32(%2,%0,4), %%ymm0, %%ymm2 \n\t" - "vaddps %%ymm8 , %%ymm1, %%ymm8 \n\t" - "vmulps 64(%2,%0,4), %%ymm0, %%ymm3 \n\t" - "vaddps %%ymm9 , %%ymm2, %%ymm9 \n\t" - "vmulps 96(%2,%0,4), %%ymm0, %%ymm4 \n\t" - "vaddps %%ymm10, %%ymm3, %%ymm10 \n\t" - "vmulps 128(%2,%0,4), %%ymm0, %%ymm5 \n\t" - "vaddps %%ymm11, %%ymm4, %%ymm11 \n\t" - "vmulps 160(%2,%0,4), %%ymm0, %%ymm6 \n\t" - "vaddps %%ymm12, %%ymm5, %%ymm12 \n\t" - "vmulps 192(%2,%0,4), %%ymm0, %%ymm7 \n\t" - "vmulps 224(%2,%0,4), %%ymm0, %%ymm1 \n\t" - - "vaddps %%ymm13, %%ymm6, %%ymm13 \n\t" - "vmovups %%ymm8 , (%3,%0,4) \n\t" - "vaddps %%ymm14, %%ymm7, %%ymm14 \n\t" - "vmovups %%ymm9 , 32(%3,%0,4) \n\t" - "vaddps %%ymm15, %%ymm1, %%ymm15 \n\t" - "vmovups %%ymm10, 64(%3,%0,4) \n\t" - "vmovups %%ymm11, 96(%3,%0,4) \n\t" - "vmovups %%ymm12,128(%3,%0,4) \n\t" - "vmovups %%ymm13,160(%3,%0,4) \n\t" - "vmovups %%ymm14,192(%3,%0,4) \n\t" - "vmovups %%ymm15,224(%3,%0,4) \n\t" - - "addq $64, %0 \n\t" - "subq $64, %1 \n\t" + + "vmovups (%2,%0,4), %%ymm4 \n\t" + "vmovups 32(%2,%0,4), %%ymm5 \n\t" + "vmovups 64(%2,%0,4), %%ymm6 \n\t" + "vmovups 96(%2,%0,4), %%ymm7 \n\t" + + "vmovups %%ymm12, -128(%3,%0,4) \n\t" + "vmovups %%ymm13, -96(%3,%0,4) \n\t" + "vmovups %%ymm14, -64(%3,%0,4) \n\t" + "vmovups %%ymm15, -32(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $32, %1 \n\t" "jnz 1b \n\t" - "vzeroupper \n\t" + + "2: \n\t" + "vmulps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vmulps %%ymm5, %%ymm0, %%ymm5 \n\t" + "vmulps %%ymm6, %%ymm0, %%ymm6 \n\t" + "vmulps %%ymm7, %%ymm0, %%ymm7 \n\t" + + "vaddps %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vaddps %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vaddps %%ymm10, %%ymm6, %%ymm14 \n\t" + "vaddps %%ymm11, %%ymm7, %%ymm15 \n\t" + + 
"vmovups %%ymm12, -128(%3,%0,4) \n\t" + "vmovups %%ymm13, -96(%3,%0,4) \n\t" + "vmovups %%ymm14, -64(%3,%0,4) \n\t" + "vmovups %%ymm15, -32(%3,%0,4) \n\t" + + "vzeroupper \n\t" : : @@ -90,7 +109,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From b2e1797dc6397f4096856f1691b769815d60a907 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 28 Apr 2015 15:33:38 +0200 Subject: [PATCH 159/257] added optimized sger kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 + kernel/x86_64/sger.c | 84 +++++++++++++++++++ kernel/x86_64/sger_microk_sandy-2.c | 124 ++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 kernel/x86_64/sger.c create mode 100644 kernel/x86_64/sger_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 055072cfd..06bb7f317 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,3 +1,5 @@ +SGERKERNEL = sger.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c diff --git a/kernel/x86_64/sger.c b/kernel/x86_64/sger.c new file mode 100644 index 000000000..84c056c0d --- /dev/null +++ b/kernel/x86_64/sger.c @@ -0,0 +1,84 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#if defined(SANDYBRIDGE) +#include "sger_microk_sandy-2.c" +#endif
+/* GER-style driver: walks the n columns of a, updating each with x scaled by alpha * y[0]. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+	  FLOAT *x, BLASLONG incx,
+	  FLOAT *y, BLASLONG incy,
+	  FLOAT *a, BLASLONG lda, FLOAT *buffer){
+
+    /* A strided x is first packed into buffer so the column updates read it with unit stride. */
+    FLOAT *xvec = x;
+    if (incx != 1) {
+	xvec = buffer;
+	COPY_K(m, x, incx, xvec, 1);
+    }
+
+    /* Split the m rows into a multiple-of-16 head and a 0..15 element tail. */
+    const BLASLONG head = m & -16;
+    const BLASLONG tail = m - head;
+
+    /* One column per pass; AXPYU_K is presumably the project's axpy kernel (y += alpha * x). */
+    for (; n > 0; n--, a += lda, y += incy)
+    {
+	FLOAT scaled = alpha * y[0];
+	if ( head > 0 )
+	{
+#ifdef HAVE_KERNEL_16
+		sger_kernel_16(head, xvec, a, &scaled);
+#else
+		AXPYU_K(head, 0, 0, scaled, xvec, 1, a, 1, NULL, 0);
+#endif
+	}
+
+	/* Rows past the last full group of 16 always go through AXPYU_K. */
+	if ( tail > 0 )
+	{
+		AXPYU_K(tail, 0, 0, scaled, xvec + head, 1, a + head, 1, NULL, 0);
+	}
+    }
+
+    return 0;
+}
+
+diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c new file mode 100644 index 000000000..51c3bef3e --- /dev/null +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%4), %%xmm0 \n\t" // alpha + "prefetcht0 256(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm8 \n\t" + "vmovups 16(%3,%0,4), %%xmm9 \n\t" + "vmovups 32(%3,%0,4), %%xmm10 \n\t" + "vmovups 48(%3,%0,4), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm4 \n\t" + "vmovups 16(%2,%0,4), %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm6 \n\t" + "vmovups 48(%2,%0,4), %%xmm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" + "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" + "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" + "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" + "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" + "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" + + "prefetcht0 256(%3,%0,4) \n\t" + "vmovups (%3,%0,4), 
%%xmm8 \n\t" + "vmovups 16(%3,%0,4), %%xmm9 \n\t" + "vmovups 32(%3,%0,4), %%xmm10 \n\t" + "vmovups 48(%3,%0,4), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm4 \n\t" + "vmovups 16(%2,%0,4), %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm6 \n\t" + "vmovups 48(%2,%0,4), %%xmm7 \n\t" + + "vmovups %%xmm12, -64(%3,%0,4) \n\t" + "vmovups %%xmm13, -48(%3,%0,4) \n\t" + "vmovups %%xmm14, -32(%3,%0,4) \n\t" + "vmovups %%xmm15, -16(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" + "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" + "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" + "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" + + "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" + "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" + + "vmovups %%xmm12, -64(%3,%0,4) \n\t" + "vmovups %%xmm13, -48(%3,%0,4) \n\t" + "vmovups %%xmm14, -32(%3,%0,4) \n\t" + "vmovups %%xmm15, -16(%3,%0,4) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 5e83d8072565975105e0546f57cb20d81db0aa5a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 28 Apr 2015 16:58:11 +0200 Subject: [PATCH 160/257] optimized dger kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 1 + kernel/x86_64/dger.c | 84 +++++++++++++++++++ kernel/x86_64/dger_microk_sandy-2.c | 124 ++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 kernel/x86_64/dger.c create mode 100644 kernel/x86_64/dger_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 06bb7f317..129d7e5c4 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ 
b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,4 +1,5 @@ SGERKERNEL = sger.c +DGERKERNEL = dger.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c diff --git a/kernel/x86_64/dger.c b/kernel/x86_64/dger.c new file mode 100644 index 000000000..157a8ea7f --- /dev/null +++ b/kernel/x86_64/dger.c @@ -0,0 +1,84 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#if defined(SANDYBRIDGE) +#include "dger_microk_sandy-2.c" +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + BLASLONG m1 = m & -16; + + while (n > 0) + { + FLOAT y0 = alpha * *y; + if ( m1 > 0 ) + { + #ifdef HAVE_KERNEL_16 + dger_kernel_16(m1, X, a, &y0); + #else + AXPYU_K(m1, 0, 0, y0, X, 1, a, 1, NULL, 0); + #endif + } + + if ( m > m1 ) + { + AXPYU_K(m-m1, 0, 0, y0, X+m1 , 1, a+m1, 1, NULL, 0); + } + + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c new file mode 100644 index 000000000..564f1356d --- /dev/null +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" + "vmovups 16(%3,%0,8), %%xmm9 \n\t" + "vmovups 32(%3,%0,8), %%xmm10 \n\t" + "vmovups 48(%3,%0,8), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm4 \n\t" + "vmovups 16(%2,%0,8), %%xmm5 \n\t" + "vmovups 32(%2,%0,8), %%xmm6 \n\t" + "vmovups 48(%2,%0,8), %%xmm7 \n\t" + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + "jz 2f \n\t" + + ".align 8 \n\t" + "1: \n\t" + + "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" + "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" + "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vmulpd %%xmm6, %%xmm0, %%xmm6 
\n\t" + "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" + "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" + "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" + + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" + "vmovups 16(%3,%0,8), %%xmm9 \n\t" + "vmovups 32(%3,%0,8), %%xmm10 \n\t" + "vmovups 48(%3,%0,8), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm4 \n\t" + "vmovups 16(%2,%0,8), %%xmm5 \n\t" + "vmovups 32(%2,%0,8), %%xmm6 \n\t" + "vmovups 48(%2,%0,8), %%xmm7 \n\t" + + "vmovups %%xmm12, -64(%3,%0,8) \n\t" + "vmovups %%xmm13, -48(%3,%0,8) \n\t" + "vmovups %%xmm14, -32(%3,%0,8) \n\t" + "vmovups %%xmm15, -16(%3,%0,8) \n\t" + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" + "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" + "vmulpd %%xmm6, %%xmm0, %%xmm6 \n\t" + "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" + + "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" + "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" + + "vmovups %%xmm12, -64(%3,%0,8) \n\t" + "vmovups %%xmm13, -48(%3,%0,8) \n\t" + "vmovups %%xmm14, -32(%3,%0,8) \n\t" + "vmovups %%xmm15, -16(%3,%0,8) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 642aaba2e0d8d2d8252874df171606cde8d20b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= Date: Wed, 29 Apr 2015 18:14:21 +0200 Subject: [PATCH 161/257] Fix detection of ARM architectures in c_check. This is necessary to avoid the false detection of a cross-compiling environment. 
--- c_check | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c_check b/c_check index fbe9c9fab..99de07067 100644 --- a/c_check +++ b/c_check @@ -4,6 +4,8 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); +$hostarch = "arm" if ($hostarch =~ /^arm.*/); +$hostarch = "arm64" if ($hostarch eq "aarch64"); $binary = $ENV{"BINARY"}; From 30f52d53df3eeec0b7f4ad3eb28791b62b227f16 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 30 Apr 2015 12:11:39 +0200 Subject: [PATCH 162/257] optimized dgemv_n kernel for haswell --- kernel/x86_64/dgemv_n_4.c | 71 +------- kernel/x86_64/dgemv_n_microk_haswell-4.c | 199 ++++------------------- kernel/x86_64/sgemv_n_4.c | 2 + 3 files changed, 40 insertions(+), 232 deletions(-) diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 27df12bef..62016fc0b 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -37,48 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define NBMAX 2048 - -#ifndef HAVE_KERNEL_4x8 - -static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT *b0,*b1,*b2,*b3; - FLOAT *x4; - FLOAT x[8]; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x4 = x + 4; - - for ( i=0; i<8; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - - y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; - y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; - y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; - y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; - - } -} - -#endif - - #ifndef HAVE_KERNEL_4x4 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) @@ -257,7 +215,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda8 = lda << 3; FLOAT xbuffer[8],*ybuffer; if ( m < 1 ) return(0); @@ -265,23 +222,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - if ( inc_x == 1 ) - { - n1 = n >> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; + n1 = n >> 2 ; + n2 = n & 3 ; - } - m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; - y_ptr = y; BLASLONG NB = NBMAX; @@ -314,22 +261,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - 
ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) { dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; a_ptr += lda4; x_ptr += 4; } diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index e1587b57c..b9f64407a 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -27,128 +27,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - 
"addq $4 , %8 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "addq $8 , %8 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -159,68 +37,59 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT 
**ap, FLOAT *x, FLOAT *y, FLOAT __asm__ __volatile__ ( - "vzeroupper \n\t" "vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + "vmovups (%6,%0,8), %%ymm2 \n\t" + "vmovups (%7,%0,8), %%ymm3 \n\t" "vbroadcastsd (%8), %%ymm6 \n\t" // alpha - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jz 2f \n\t" - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + ".align 16 \n\t" + "1: \n\t" - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" + "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" + "vmovups (%6,%0,8), %%ymm2 \n\t" + "vmovups (%7,%0,8), %%ymm3 \n\t" + + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y - "2: \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + - "cmpq $0, %1 \n\t" - "je 3f \n\t" + "2: \n\t" + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" + "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd 
(%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y - "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - "jnz 1b \n\t" - "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 840ce9207..dc88ea098 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -376,6 +376,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; a_ptr += lda4; x_ptr += 4; } From 133c11a1563efaf231d3822d67604ac7278145a6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 30 Apr 2015 14:38:06 +0200 Subject: [PATCH 163/257] updated dgemv_n kernel for nehalem --- kernel/x86_64/dgemv_n_microk_nehalem-4.c | 237 +++++++---------------- 1 file changed, 72 insertions(+), 165 deletions(-) diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c index 0d2c24d52..d8c29831a 100644 --- a/kernel/x86_64/dgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -27,10 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -46,200 +46,107 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "shufpd $0, %%xmm14, %%xmm14\n\t" "shufpd $0, %%xmm15, %%xmm15\n\t" - "movsd 32(%2), %%xmm0 \n\t" // x4 - "movsd 40(%2), %%xmm1 \n\t" // x5 - "movsd 48(%2), %%xmm2 \n\t" // x6 - "movsd 56(%2), %%xmm3 \n\t" // x7 - "shufpd $0, %%xmm0 , %%xmm0 \n\t" - "shufpd $0, %%xmm1 , %%xmm1 \n\t" - "shufpd $0, %%xmm2 , %%xmm2 \n\t" - "shufpd $0, %%xmm3 , %%xmm3 \n\t" - - "movsd (%9), %%xmm6 \n\t" // alpha + "movsd (%8), %%xmm6 \n\t" // alpha "shufpd $0, %%xmm6 , %%xmm6 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "xorpd %%xmm4 , %%xmm4 \n\t" - "xorpd %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y - - ".align 2 \n\t" "movups (%4,%0,8), %%xmm8 \n\t" + "movups 16(%4,%0,8), %%xmm0 \n\t" "movups (%5,%0,8), %%xmm9 \n\t" + "movups 16(%5,%0,8), %%xmm1 \n\t" "movups (%6,%0,8), %%xmm10 \n\t" + "movups 16(%6,%0,8), %%xmm2 \n\t" "movups (%7,%0,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "movups (%4,%8,8), %%xmm8 \n\t" - "movups (%5,%8,8), %%xmm9 \n\t" - "movups (%6,%8,8), %%xmm10 \n\t" - "movups (%7,%8,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm0 , %%xmm8 \n\t" - "mulpd %%xmm1 , %%xmm9 \n\t" - "mulpd %%xmm2 , 
%%xmm10 \n\t" - "mulpd %%xmm3 , %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" + "movups 16(%7,%0,8), %%xmm3 \n\t" - "addpd %%xmm5 , %%xmm4 \n\t" - "mulpd %%xmm6 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm7 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jz 2f \n\t" - "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y + ".align 16 \n\t" + "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" - "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y - - ".align 2 \n\t" - "movups 16(%4,%0,8), %%xmm8 \n\t" - "movups 16(%5,%0,8), %%xmm9 \n\t" - "movups 16(%6,%0,8), %%xmm10 \n\t" - "movups 16(%7,%0,8), %%xmm11 \n\t" - ".align 2 \n\t" + "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "movups 16(%4,%8,8), %%xmm8 \n\t" - "movups 16(%5,%8,8), %%xmm9 \n\t" - "movups 16(%6,%8,8), %%xmm10 \n\t" - "movups 16(%7,%8,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm0 , %%xmm8 \n\t" - "mulpd %%xmm1 , %%xmm9 \n\t" - "mulpd %%xmm2 , %%xmm10 \n\t" - "mulpd %%xmm3 , %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm0 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" + "addpd %%xmm0 , %%xmm5 \n\t" - "addq $4 , %8 \n\t" - "addpd %%xmm5 , %%xmm4 \n\t" - "mulpd %%xmm6 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm7 \n\t" - - "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y + "movups (%4,%0,8), %%xmm8 \n\t" + "movups 16(%4,%0,8), %%xmm0 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz 1b \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm13, %%xmm1 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm1 , %%xmm5 \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 
4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); + "movups (%5,%0,8), %%xmm9 \n\t" + "movups 16(%5,%0,8), %%xmm1 \n\t" -} + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm14, %%xmm2 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm2 , %%xmm5 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups 16(%6,%0,8), %%xmm2 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "mulpd %%xmm15, %%xmm3 \n\t" + "addpd %%xmm11 , %%xmm4 \n\t" + "addpd %%xmm3 , %%xmm5 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "movups 16(%7,%0,8), %%xmm3 \n\t" -#define HAVE_KERNEL_4x4 1 -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm7 , %%xmm4 \n\t" + "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y + "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y - BLASLONG register i = 0; + "mulpd %%xmm6 , %%xmm5 \n\t" + "addpd %%xmm7 , %%xmm5 \n\t" + "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y - __asm__ __volatile__ - ( - "movsd (%2), %%xmm12 \n\t" // x0 - "movsd 8(%2), %%xmm13 \n\t" // x1 - "movsd 16(%2), %%xmm14 \n\t" // x2 - "movsd 24(%2), %%xmm15 \n\t" // x3 - "shufpd $0, %%xmm12, %%xmm12\n\t" - "shufpd $0, %%xmm13, %%xmm13\n\t" - "shufpd $0, %%xmm14, %%xmm14\n\t" - "shufpd $0, %%xmm15, %%xmm15\n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" - "movsd (%8), %%xmm6 \n\t" // alpha - "shufpd $0, %%xmm6 , %%xmm6 \n\t" + "2: \n\t" - ".align 16 \n\t" - "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y - "movups (%4,%0,8), %%xmm8 \n\t" - "movups (%5,%0,8), %%xmm9 \n\t" - "movups (%6,%0,8), %%xmm10 \n\t" - "movups (%7,%0,8), 
%%xmm11 \n\t" "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm4 \n\t" - "addpd %%xmm10 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm11 \n\t" - - "mulpd %%xmm6 , %%xmm11 \n\t" - "addpd %%xmm7 , %%xmm11 \n\t" - "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y - - "xorpd %%xmm4 , %%xmm4 \n\t" - "xorpd %%xmm5 , %%xmm5 \n\t" - "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y - - "movups 16(%4,%0,8), %%xmm8 \n\t" - "movups 16(%5,%0,8), %%xmm9 \n\t" - "movups 16(%6,%0,8), %%xmm10 \n\t" - "movups 16(%7,%0,8), %%xmm11 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" "addpd %%xmm9 , %%xmm4 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" "addpd %%xmm10 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm11 \n\t" - - "mulpd %%xmm6 , %%xmm11 \n\t" - "addpd %%xmm7 , %%xmm11 \n\t" - "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz 1b \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm11 , %%xmm4 \n\t" + + "mulpd %%xmm12, %%xmm0 \n\t" + "addpd %%xmm0 , %%xmm5 \n\t" + "mulpd %%xmm13, %%xmm1 \n\t" + "addpd %%xmm1 , %%xmm5 \n\t" + "mulpd %%xmm14, %%xmm2 \n\t" + "addpd %%xmm2 , %%xmm5 \n\t" + "mulpd %%xmm15, %%xmm3 \n\t" + "addpd %%xmm3 , %%xmm5 \n\t" + + "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm7 , %%xmm4 \n\t" + "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y + + "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm5 \n\t" + "addpd %%xmm7 , %%xmm5 \n\t" + "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y : : @@ -253,8 +160,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", 
"%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" From 924bc5372edd4487fecccb27f4f0b91674a030e8 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 5 May 2015 11:39:43 +0200 Subject: [PATCH 164/257] removed gemm3m functions from normal checks --- ctest/c_c3chke.c | 230 +---- ctest/c_c3chke_3m.c | 1936 ++++++++++++++++++++++++++++++++++++++++++ ctest/c_cblas3.c | 78 -- ctest/c_cblas3_3m.c | 647 +++++++++++++++ ctest/c_z3chke.c | 232 +----- ctest/c_z3chke_3m.c | 1940 +++++++++++++++++++++++++++++++++++++++++++ ctest/c_zblas3.c | 77 -- ctest/c_zblas3_3m.c | 643 ++++++++++++++ 8 files changed, 5168 insertions(+), 615 deletions(-) create mode 100644 ctest/c_c3chke_3m.c create mode 100644 ctest/c_cblas3_3m.c create mode 100644 ctest/c_z3chke_3m.c create mode 100644 ctest/c_zblas3_3m.c diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c index 4d5de5150..3b4764c4a 100644 --- a/ctest/c_c3chke.c +++ b/ctest/c_c3chke.c @@ -46,235 +46,7 @@ void F77_c3chke(char * rout) { } - if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { - cblas_rout = "cblas_cgemm3" ; - - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 
1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, 
CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( 
CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - 
chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - - } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { + if (strncmp( sf,"cblas_cgemm" ,11)==0) { cblas_rout = "cblas_cgemm" ; diff --git a/ctest/c_c3chke_3m.c b/ctest/c_c3chke_3m.c new file mode 100644 index 000000000..4d5de5150 --- /dev/null +++ b/ctest/c_c3chke_3m.c @@ -0,0 +1,1936 @@ +#include 
+#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c3chke(char * rout) { + char *sf = ( rout ) ; + float A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + + if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { + cblas_rout = "cblas_cgemm3" ; + + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = 
FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 
1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + 
ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + 
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + + + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 
1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, 
CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + 
cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; 
RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; 
RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { + cblas_rout = "cblas_chemm" ; + + cblas_info = 1; + cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + 
cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, 
CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csymm" ,11)==0) { + cblas_rout = "cblas_csymm" ; + + cblas_info = 1; + cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + 
cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( 
CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 
1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { + cblas_rout = "cblas_ctrmm" ; + + cblas_info = 1; + cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + 
CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + 
CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 
2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + 
cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + 
cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( 
CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { + cblas_rout = "cblas_ctrsm" ; + + cblas_info = 1; + cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( 
CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + 
cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, 
CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, 
CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, 
CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + 
chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cherk" ,11)==0) { + cblas_rout = "cblas_cherk" ; + + cblas_info = 1; + cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + 
chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + 
RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { + cblas_rout = "cblas_csyrk" ; + + cblas_info = 1; + cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info 
= 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = 
FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { + cblas_rout = "cblas_cher2k" ; + + cblas_info = 1; + cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 
2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; 
RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { + cblas_rout = "cblas_csyr2k" ; + + cblas_info = 1; + cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, 
CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c index f1b108c64..9f48c49b1 100644 --- a/ctest/c_cblas3.c +++ b/ctest/c_cblas3.c @@ -567,81 +567,3 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, -void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, - int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, - CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, - CBLAS_TEST_COMPLEX *c, int *ldc ) { - - CBLAS_TEST_COMPLEX *A, *B, *C; - int i,j,LDA, LDB, LDC; - enum CBLAS_TRANSPOSE transa, transb; - - get_transpose_type(transpa, &transa); - get_transpose_type(transpb, &transb); - - if (*order == TEST_ROW_MJR) { - if (transa == CblasNoTrans) { - LDA = *k+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*m; i++ ) - for( j=0; j<*k; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - else { - LDA = *m+1; - A=(CBLAS_TEST_COMPLEX* 
)malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*k; i++ ) - for( j=0; j<*m; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - - if (transb == CblasNoTrans) { - LDB = *n+1; - B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); - for( i=0; i<*k; i++ ) - for( j=0; j<*n; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - else { - LDB = *k+1; - B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*n; i++ ) - for( j=0; j<*k; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - - LDC = *n+1; - C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - C[i*LDC+j].real=c[j*(*ldc)+i].real; - C[i*LDC+j].imag=c[j*(*ldc)+i].imag; - } - cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, - B, LDB, beta, C, LDC ); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - c[j*(*ldc)+i].real=C[i*LDC+j].real; - c[j*(*ldc)+i].imag=C[i*LDC+j].imag; - } - free(A); - free(B); - free(C); - } - else if (*order == TEST_COL_MJR) - cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); - else - cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); -} - - diff --git a/ctest/c_cblas3_3m.c b/ctest/c_cblas3_3m.c new file mode 100644 index 000000000..f1b108c64 --- /dev/null +++ b/ctest/c_cblas3_3m.c @@ -0,0 +1,647 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. 
+ */ +#include +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } 
+ free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + +void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + 
beta, c, *ldc ); + else + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + 
get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + 
for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + 
B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } 
+ LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( 
j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, 
*m, *n, alpha, + a, *lda, b, *ldb); +} + + + +void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + 
cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + + diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c index 4be4457b4..054e72360 100644 --- a/ctest/c_z3chke.c +++ b/ctest/c_z3chke.c @@ -49,237 +49,7 @@ void F77_z3chke(char * rout) { - if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { - cblas_rout = "cblas_zgemm3" ; - - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 
INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = 
FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - 
cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, 
A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - - - - } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { + if (strncmp( sf,"cblas_zgemm" ,11)==0) { cblas_rout = "cblas_zgemm" ; cblas_info = 1; diff --git a/ctest/c_z3chke_3m.c b/ctest/c_z3chke_3m.c new file mode 100644 index 000000000..4be4457b4 --- /dev/null +++ b/ctest/c_z3chke_3m.c @@ -0,0 +1,1940 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} 
+ +void F77_z3chke(char * rout) { + char *sf = ( rout ) ; + double A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + + + + + if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { + cblas_rout = "cblas_zgemm3" ; + + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 
1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( 
CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg 
= TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + + + } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; + + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; 
RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + 
ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { + cblas_rout = "cblas_zhemm" ; + + cblas_info = 1; + cblas_zhemm( INVALID, 
CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, 
BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + 
cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, 
BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { + cblas_rout = "cblas_zsymm" ; + + cblas_info = 1; + cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, 
B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + 
cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + 
ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { + cblas_rout = "cblas_ztrmm" ; + + cblas_info = 1; + cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = 
FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; 
RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( 
CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, 
CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, 
B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { + cblas_rout = "cblas_ztrsm" ; + + cblas_info = 1; + cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 
INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + 
CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + 
chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); 
+ cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg 
= TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zherk" ,11)==0) { + cblas_rout = "cblas_zherk" ; + + cblas_info = 1; + cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, 
CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + 
chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { + cblas_rout = "cblas_zsyrk" ; + + cblas_info = 1; + cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + 
cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { + cblas_rout = "cblas_zher2k" ; + + cblas_info = 1; + cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + 
ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 
2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, 
RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { + cblas_rout = "cblas_zsyr2k" ; + + cblas_info = 1; + cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 
INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + 
cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c index 46ff467d0..40afa4edf 100644 --- a/ctest/c_zblas3.c +++ b/ctest/c_zblas3.c @@ -564,80 +564,3 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, } -void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, - int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, - CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, - CBLAS_TEST_ZOMPLEX *c, int *ldc ) { - - CBLAS_TEST_ZOMPLEX *A, *B, *C; - int i,j,LDA, LDB, LDC; - enum CBLAS_TRANSPOSE transa, transb; - - get_transpose_type(transpa, &transa); - get_transpose_type(transpb, &transb); - - if (*order == TEST_ROW_MJR) { - if (transa == CblasNoTrans) { - LDA = *k+1; - A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*m; i++ ) - for( j=0; j<*k; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - else { - LDA = *m+1; - A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*k; i++ ) - for( j=0; j<*m; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - - if (transb == CblasNoTrans) { - LDB = *n+1; - B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); - for( i=0; i<*k; i++ ) - for( j=0; j<*n; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - else { - LDB = *k+1; - B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*n; i++ ) - for( j=0; j<*k; j++ 
) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - - LDC = *n+1; - C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - C[i*LDC+j].real=c[j*(*ldc)+i].real; - C[i*LDC+j].imag=c[j*(*ldc)+i].imag; - } - cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, - B, LDB, beta, C, LDC ); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - c[j*(*ldc)+i].real=C[i*LDC+j].real; - c[j*(*ldc)+i].imag=C[i*LDC+j].imag; - } - free(A); - free(B); - free(C); - } - else if (*order == TEST_COL_MJR) - cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); - else - cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); -} - diff --git a/ctest/c_zblas3_3m.c b/ctest/c_zblas3_3m.c new file mode 100644 index 000000000..46ff467d0 --- /dev/null +++ b/ctest/c_zblas3_3m.c @@ -0,0 +1,643 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. 
+ */ +#include +#include "common.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + 
free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, 
c, *ldc ); + else + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + 
get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + 
for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + 
B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } 
+ LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( 
j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, 
*m, *n, alpha, + a, *lda, b, *ldb); +} + + +void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm3m( 
CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + From 9bfd267d51e9d1b21a072a7f8e623e8554607bcc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 5 May 2015 11:58:59 +0200 Subject: [PATCH 165/257] bugfix for gemm3m tests --- ctest/Makefile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 1d9567150..7a5d236aa 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -27,12 +27,18 @@ ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o +ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o + ztestl1o = c_zblas1.o ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o +ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o + + + all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 @@ -115,8 +121,8 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) @@ -127,8 +133,8 @@ xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) 
$(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) include $(TOPDIR)/Makefile.tail From be3c843700376efb5a8be2526fa7e34042e68646 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 6 May 2015 09:21:19 +0200 Subject: [PATCH 166/257] added loops to trsm.c --- benchmark/trsm.c | 49 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/benchmark/trsm.c b/benchmark/trsm.c index ed969b707..9eae3380c 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -130,11 +130,21 @@ int main(int argc, char *argv[]){ char trans='N'; char diag ='U'; + + int l; + int loops = 1; + double timeg; + if ((p = getenv("OPENBLAS_SIDE"))) side=*p; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + p = getenv("OPENBLAS_LOOPS"); + if ( p != NULL ) + loops = atoi(p); + + blasint m, i, j; int from = 1; @@ -150,7 +160,7 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag); + fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -171,28 +181,35 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - fprintf(stderr, " %6d : ", (int)m); + timeg=0.0; - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } + 
fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l Date: Wed, 6 May 2015 10:41:53 +0200 Subject: [PATCH 167/257] use only 1 thread if m or n < 2*GEMM_MULTITHREAD_THRESHOLD --- interface/trsm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/interface/trsm.c b/interface/trsm.c index 266372988..3d4aed282 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -362,6 +362,12 @@ void CNAME(enum CBLAS_ORDER order, mode |= (side << BLAS_RSIDE_SHIFT); args.nthreads = num_cpu_avail(3); + if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + args.nthreads = 1; + else + if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + args.nthreads = 1; + if (args.nthreads == 1) { #endif From cda29f183b9653671ea662506179b8827511934a Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 6 May 2015 21:52:34 -0400 Subject: [PATCH 168/257] Add vecLib benchmarks --- Makefile.system | 4 + benchmark/Makefile | 272 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 270 insertions(+), 6 deletions(-) diff --git a/Makefile.system b/Makefile.system index 525daa41b..201d1e23d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -23,6 +23,7 @@ CC = gcc UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) CC = clang + EXTRALIB += -Wl,-no_compact_unwind endif endif @@ -557,6 +558,9 @@ FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran +ifeq ($(UNAME_S),Darwin) +EXTRALIB += -L/usr/local/lib/gcc/4.9/ +endif endif ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d2e9ff66..1418fd522 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -30,7 +30,8 @@ LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf MKL=/home/saar/intel_mkl LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - +# Apple vecLib +LIBVECLIB = -framework Accelerate goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto 
dcholesky.goto ccholesky.goto zcholesky.goto \ @@ -125,12 +126,34 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib goto_3m :: cgemm3m.goto zgemm3m.goto -mkl_3m :: cgemm3m.mkl zgemm3m.mkl +mkl_3m :: cgemm3m.mkl zgemm3m.mkl -all :: goto mkl atlas acml +all :: goto mkl atlas acml veclib ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) @@ -145,6 +168,9 @@ slinpack.atlas : slinpack.$(SUFFIX) slinpack.mkl : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -158,6 +184,9 @@ dlinpack.atlas : dlinpack.$(SUFFIX) dlinpack.mkl : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) @@ -172,6 +201,9 @@ clinpack.atlas : clinpack.$(SUFFIX) clinpack.mkl : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) @@ -186,6 +218,9 @@ zlinpack.atlas : zlinpack.$(SUFFIX) zlinpack.mkl : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) @@ -200,6 +235,9 @@ scholesky.atlas : scholesky.$(SUFFIX) scholesky.mkl : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) @@ -214,6 +252,9 @@ 
dcholesky.atlas : dcholesky.$(SUFFIX) dcholesky.mkl : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) @@ -228,6 +269,9 @@ ccholesky.atlas : ccholesky.$(SUFFIX) ccholesky.mkl : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -245,6 +289,8 @@ zcholesky.atlas : zcholesky.$(SUFFIX) zcholesky.mkl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) @@ -259,6 +305,9 @@ sgemm.atlas : sgemm.$(SUFFIX) sgemm.mkl : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -272,6 +321,9 @@ dgemm.atlas : dgemm.$(SUFFIX) dgemm.mkl : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
+ ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) @@ -286,6 +338,9 @@ cgemm.atlas : cgemm.$(SUFFIX) cgemm.mkl : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) @@ -300,6 +355,9 @@ zgemm.atlas : zgemm.$(SUFFIX) zgemm.mkl : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -313,6 +371,9 @@ ssymm.atlas : ssymm.$(SUFFIX) ssymm.mkl : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymm #################################################### dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -326,6 +387,9 @@ dsymm.atlas : dsymm.$(SUFFIX) dsymm.mkl : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csymm #################################################### csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) @@ -340,6 +404,9 @@ csymm.atlas : csymm.$(SUFFIX) csymm.mkl : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsymm #################################################### zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) @@ -354,6 +421,9 @@ zsymm.atlas : zsymm.$(SUFFIX) zsymm.mkl : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strmm #################################################### strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -367,6 +437,9 @@ strmm.atlas : strmm.$(SUFFIX) strmm.mkl : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -380,6 +453,9 @@ dtrmm.atlas : dtrmm.$(SUFFIX) dtrmm.mkl : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) @@ -394,6 +470,9 @@ ctrmm.atlas : ctrmm.$(SUFFIX) ctrmm.mkl : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrmm 
#################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) @@ -408,6 +487,8 @@ ztrmm.atlas : ztrmm.$(SUFFIX) ztrmm.mkl : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) @@ -422,6 +503,9 @@ strsm.atlas : strsm.$(SUFFIX) strsm.mkl : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -435,6 +519,9 @@ dtrsm.atlas : dtrsm.$(SUFFIX) dtrsm.mkl : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) @@ -449,6 +536,9 @@ ctrsm.atlas : ctrsm.$(SUFFIX) ctrsm.mkl : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) @@ -463,6 +553,9 @@ ztrsm.atlas : ztrsm.$(SUFFIX) ztrsm.mkl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -476,6 +569,9 @@ ssyrk.atlas : ssyrk.$(SUFFIX) ssyrk.mkl : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsyrk #################################################### dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -489,6 +585,9 @@ dsyrk.atlas : dsyrk.$(SUFFIX) dsyrk.mkl : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csyrk #################################################### csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) @@ -503,6 +602,9 @@ csyrk.atlas : csyrk.$(SUFFIX) csyrk.mkl : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsyrk #################################################### zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) @@ -517,6 +619,8 @@ zsyrk.atlas : zsyrk.$(SUFFIX) zsyrk.mkl : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyr2k #################################################### ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) @@ -531,6 +635,9 @@ 
ssyr2k.atlas : ssyr2k.$(SUFFIX) ssyr2k.mkl : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsyr2k #################################################### dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -544,6 +651,9 @@ dsyr2k.atlas : dsyr2k.$(SUFFIX) dsyr2k.mkl : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csyr2k #################################################### csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) @@ -558,6 +668,9 @@ csyr2k.atlas : csyr2k.$(SUFFIX) csyr2k.mkl : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsyr2k #################################################### zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) @@ -572,6 +685,9 @@ zsyr2k.atlas : zsyr2k.$(SUFFIX) zsyr2k.mkl : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Chemm #################################################### chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) @@ -586,6 +702,9 @@ chemm.atlas : chemm.$(SUFFIX) chemm.mkl : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
##################################### Zhemm #################################################### zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) @@ -600,6 +719,9 @@ zhemm.atlas : zhemm.$(SUFFIX) zhemm.mkl : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cherk #################################################### cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) @@ -614,6 +736,9 @@ cherk.atlas : cherk.$(SUFFIX) cherk.mkl : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zherk #################################################### zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) @@ -628,6 +753,9 @@ zherk.atlas : zherk.$(SUFFIX) zherk.mkl : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cher2k #################################################### cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) @@ -642,6 +770,9 @@ cher2k.atlas : cher2k.$(SUFFIX) cher2k.mkl : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zher2k #################################################### zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) @@ -656,6 +787,9 @@ zher2k.atlas : zher2k.$(SUFFIX) zher2k.mkl : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -669,6 +803,9 @@ sgemv.atlas : sgemv.$(SUFFIX) sgemv.mkl : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemv #################################################### dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -682,6 +819,9 @@ dgemv.atlas : dgemv.$(SUFFIX) dgemv.mkl : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemv #################################################### cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) @@ -696,6 +836,9 @@ cgemv.atlas : cgemv.$(SUFFIX) cgemv.mkl : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemv #################################################### zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) @@ -710,6 +853,9 @@ zgemv.atlas : zgemv.$(SUFFIX) zgemv.mkl : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) -lm @@ -723,6 +869,9 @@ sger.atlas : sger.$(SUFFIX) sger.mkl : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dger #################################################### dger.goto : dger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -736,6 +885,9 @@ dger.atlas : dger.$(SUFFIX) dger.mkl : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cger #################################################### cger.goto : cger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -749,6 +901,9 @@ cger.atlas : cger.$(SUFFIX) cger.mkl : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zger #################################################### zger.goto : zger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -762,6 +917,9 @@ zger.atlas : zger.$(SUFFIX) zger.mkl : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymv #################################################### ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -775,6 +933,9 @@ ssymv.atlas : ssymv.$(SUFFIX) ssymv.mkl : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymv #################################################### dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -788,6 +949,9 @@ dsymv.atlas : dsymv.$(SUFFIX) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csymv #################################################### csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -801,6 +965,9 @@ csymv.atlas : csymv.$(SUFFIX) csymv.mkl : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymv #################################################### zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -814,6 +981,9 @@ zsymv.atlas : zsymv.$(SUFFIX) zsymv.mkl : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -827,6 +997,9 @@ sgeev.atlas : sgeev.$(SUFFIX) sgeev.mkl : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -840,6 +1013,9 @@ dgeev.atlas : dgeev.$(SUFFIX) dgeev.mkl : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) @@ -854,6 +1030,9 @@ cgeev.atlas : cgeev.$(SUFFIX) cgeev.mkl : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) @@ -868,6 +1047,8 @@ zgeev.atlas : zgeev.$(SUFFIX) zgeev.mkl : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) @@ -882,6 +1063,9 @@ sgetri.atlas : sgetri.$(SUFFIX) sgetri.mkl : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -895,6 +1079,9 @@ dgetri.atlas : dgetri.$(SUFFIX) dgetri.mkl : 
dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) @@ -909,6 +1096,9 @@ cgetri.atlas : cgetri.$(SUFFIX) cgetri.mkl : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) @@ -923,6 +1113,8 @@ zgetri.atlas : zgetri.$(SUFFIX) zgetri.mkl : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Spotrf #################################################### spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) @@ -937,6 +1129,9 @@ spotrf.atlas : spotrf.$(SUFFIX) spotrf.mkl : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dpotrf #################################################### dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -950,6 +1145,9 @@ dpotrf.atlas : dpotrf.$(SUFFIX) dpotrf.mkl : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cpotrf 
#################################################### cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) @@ -964,6 +1162,9 @@ cpotrf.atlas : cpotrf.$(SUFFIX) cpotrf.mkl : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zpotrf #################################################### zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) @@ -978,6 +1179,9 @@ zpotrf.atlas : zpotrf.$(SUFFIX) zpotrf.mkl : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Chemv #################################################### chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) @@ -992,6 +1196,9 @@ chemv.atlas : chemv.$(SUFFIX) chemv.mkl : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zhemv #################################################### zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) @@ -1006,6 +1213,9 @@ zhemv.atlas : zhemv.$(SUFFIX) zhemv.mkl : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1019,6 +1229,9 @@ sdot.atlas : sdot.$(SUFFIX) sdot.mkl : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sdot.veclib : sdot.$(SUFFIX) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1032,6 +1245,9 @@ ddot.atlas : ddot.$(SUFFIX) ddot.mkl : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cdot #################################################### cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1045,6 +1261,9 @@ cdot.atlas : cdot.$(SUFFIX) cdot.mkl : cdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zdot #################################################### zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1058,6 +1277,9 @@ zdot.atlas : zdot.$(SUFFIX) zdot.mkl : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1071,6 +1293,9 @@ saxpy.atlas : saxpy.$(SUFFIX) saxpy.mkl : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Daxpy 
#################################################### daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1084,6 +1309,9 @@ daxpy.atlas : daxpy.$(SUFFIX) daxpy.mkl : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Caxpy #################################################### caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) @@ -1098,6 +1326,9 @@ caxpy.atlas : caxpy.$(SUFFIX) caxpy.mkl : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zaxpy #################################################### zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) @@ -1112,6 +1343,9 @@ zaxpy.atlas : zaxpy.$(SUFFIX) zaxpy.mkl : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sscal #################################################### sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1125,6 +1359,9 @@ sscal.atlas : sscal.$(SUFFIX) sscal.mkl : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dscal #################################################### dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1138,6 +1375,9 @@ dscal.atlas : dscal.$(SUFFIX) dscal.mkl : dscal.$(SUFFIX) 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cscal #################################################### cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) @@ -1152,6 +1392,9 @@ cscal.atlas : cscal.$(SUFFIX) cscal.mkl : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zscal #################################################### zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) @@ -1166,6 +1409,9 @@ zscal.atlas : zscal.$(SUFFIX) zscal.mkl : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sasum #################################################### sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1179,6 +1425,9 @@ sasum.atlas : sasum.$(SUFFIX) sasum.mkl : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dasum #################################################### dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1192,6 +1441,9 @@ dasum.atlas : dasum.$(SUFFIX) dasum.mkl : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### 
Casum #################################################### casum.goto : casum.$(SUFFIX) ../$(LIBNAME) @@ -1206,6 +1458,9 @@ casum.atlas : casum.$(SUFFIX) casum.mkl : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zasum #################################################### zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) @@ -1220,8 +1475,8 @@ zasum.atlas : zasum.$(SUFFIX) zasum.mkl : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgemm3m #################################################### @@ -1231,6 +1486,9 @@ cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) cgemm3m.mkl : cgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm3m #################################################### zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) @@ -1239,6 +1497,8 @@ zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) zgemm3m.mkl : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1502,7 +1762,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c clean :: - @rm -f *.goto *.mkl *.acml *.atlas + @rm -f *.goto *.mkl *.acml *.atlas *.veclib include $(TOPDIR)/Makefile.tail From 6743beb7480d8e31e11211cb4fce5954749ee70e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: 
Thu, 7 May 2015 13:06:53 +0800 Subject: [PATCH 169/257] Refs #565. Fix the bug of generate FEXTRALIB. --- Makefile.system | 5 +---- f_check | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 201d1e23d..65294b906 100644 --- a/Makefile.system +++ b/Makefile.system @@ -23,7 +23,7 @@ CC = gcc UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) CC = clang - EXTRALIB += -Wl,-no_compact_unwind +# EXTRALIB += -Wl,-no_compact_unwind endif endif @@ -558,9 +558,6 @@ FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran -ifeq ($(UNAME_S),Darwin) -EXTRALIB += -L/usr/local/lib/gcc/4.9/ -endif endif ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) diff --git a/f_check b/f_check index 5719faff1..7c6cc78ae 100644 --- a/f_check +++ b/f_check @@ -38,6 +38,7 @@ OUTER: foreach $path (@path) { if (-x $path . "/" . $lists) { $compiler = $lists; + $compiler_bin = $lists; last OUTER; } } From 8e5a1083bbfca6d93e3d35c1490311cbda675761 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 8 May 2015 05:33:17 +0800 Subject: [PATCH 170/257] Refs #532. Improve gemv paralel with small m and large n case. Splite the matrix and reduction. 
--- driver/level2/gemv_thread.c | 84 ++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index ddd475367..061454848 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -62,6 +62,11 @@ #endif #endif +#ifndef TRANSA +#define Y_DUMMY_NUM 1024 +static FLOAT y_dummy[Y_DUMMY_NUM]; +#endif + static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; @@ -99,10 +104,15 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a += n_from * lda * COMPSIZE; #ifdef TRANSA y += n_from * incy * COMPSIZE; +#else + //for split matrix row (n) direction and vector x of gemv_n + x += n_from * incx * COMPSIZE; + //store partial result for every thread + y += (m_to - m_from) * 1 * COMPSIZE * pos; #endif } - // fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d\n", m_from, m_to, n_from, n_to); + //fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d POS=%d\n", m_from, m_to, n_from, n_to, pos); GEMV(m_to - m_from, n_to - n_from, 0, *((FLOAT *)args -> alpha + 0), @@ -126,6 +136,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; +#ifndef TRANSA + int split_x=0; +#endif + #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE @@ -198,6 +212,58 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x i -= width; } +#ifndef TRANSA + //try to split matrix on row direction and x. + //Then, reduction. + if (num_cpu < nthreads) { + + //too small to split or bigger than the y_dummy buffer. 
+ double MN = (double) m * (double) n; + if ( MN <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD)) + || m*COMPSIZE*nthreads > Y_DUMMY_NUM) + goto Outer; + + num_cpu = 0; + range[0] = 0; + + memset(y_dummy, 0, sizeof(FLOAT) * m * COMPSIZE * nthreads); + + args.ldc = 1; + args.c = (void *)y_dummy; + + //split on row (n) and x + i=n; + split_x=1; + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gemv_kernel; + queue[num_cpu].args = &args; + + queue[num_cpu].position = num_cpu; + + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + } + + Outer: +#endif + if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; @@ -206,5 +272,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x exec_blas(num_cpu, queue); } +#ifndef TRANSA + if(split_x==1){ + //reduction + for(i=0; i Date: Sun, 10 May 2015 00:10:26 -0700 Subject: [PATCH 171/257] Fix build with ALLOC_SHM=0 (Android NDK) Refactor such that you can build with ALLOC_SHM=0. HughTLB implicity depends on ALLOC_SHM=1. This patch allows building for Android NDK r10d. 
--- driver/others/memory.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 12172fd80..a9a80b8b4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -709,8 +709,6 @@ static void *alloc_shm(void *address){ return map_address; } -#endif - #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS static void alloc_hugetlb_free(struct release_t *release){ @@ -817,6 +815,8 @@ static void *alloc_hugetlb(void *address){ } #endif +#endif + #ifdef ALLOC_HUGETLBFILE static int hugetlb_pid = 0; @@ -917,12 +917,13 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER alloc_devicedirver, #endif -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS - alloc_hugetlb, -#endif +/* Hugetlb implicitly assumes ALLOC_SHM */ #ifdef ALLOC_SHM alloc_shm, #endif +#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) + alloc_hugetlb, +#endif #ifdef ALLOC_MMAP alloc_mmap, #endif @@ -1062,7 +1063,7 @@ void *blas_memory_alloc(int procpos){ } #endif -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif From e50a9330374dba70d406d6be37ed65f46214621a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 12:28:44 +0200 Subject: [PATCH 172/257] added optimized dscal kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dscal.c | 143 ++++++++++++++++ kernel/x86_64/dscal_microk_bulldozer-2.c | 206 +++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 kernel/x86_64/dscal.c create mode 100644 kernel/x86_64/dscal_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 
ef1108646..cd1665026 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c new file mode 100644 index 000000000..99001115b --- /dev/null +++ b/kernel/x86_64/dscal.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(BULLDOZER) +#include "dscal_microk_bulldozer-2.c" +#endif + + +#if !defined(HAVE_KERNEL_8) + +void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + if ( da == 0.0 ) + dscal_kernel_8_zero(n1 , &da , x); + else + dscal_kernel_8(n1 , &da , x); + } + + if ( da == 0.0 ) + { + for ( i=n1 ; i> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 256(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + "prefetcht0 320(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , 
-32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 
7aee9139910a33a53718bd2c43db101caceefebe Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 16:27:43 +0200 Subject: [PATCH 173/257] added optimized dscal kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 + kernel/x86_64/dscal.c | 2 + kernel/x86_64/dscal_microk_sandy-2.c | 206 +++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 kernel/x86_64/dscal_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 129d7e5c4..ea81979ac 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SGERKERNEL = sger.c DGERKERNEL = dger.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 99001115b..be486a48e 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "dscal_microk_bulldozer-2.c" +#elif defined(SANDYBRIDGE) +#include "dscal_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c new file mode 100644 index 000000000..f5bf5932f --- /dev/null +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 640(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 
16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + "prefetcht0 704(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , 
-96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 02e772c7e42c24fd84169787aca88eb257a535d7 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 17:19:58 +0200 Subject: [PATCH 174/257] added optimized dscal kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/dscal.c | 2 + kernel/x86_64/dscal_microk_haswell-2.c | 206 +++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 kernel/x86_64/dscal_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 6849b05d9..188c51bf2 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index be486a48e..66a04ba8f 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" +#elif defined(HASWELL) +#include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c new file mode 100644 index 000000000..07a9c804c --- /dev/null +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + // "prefetcht0 640(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + // "prefetcht0 704(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + 
"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 73f09bf64f9ae6a95a6ff5182d4b4262c95337d2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 12:14:39 +0200 Subject: [PATCH 175/257] optimized dscal kernel for increment != 1 --- 
kernel/x86_64/dscal.c | 91 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 66a04ba8f..d72a24b16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -81,6 +81,77 @@ void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) #endif + +void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); + +void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + + FLOAT *x1; + BLASLONG inc_x3; + + inc_x <<= 3; + inc_x3 = (inc_x << 1) + inc_x; + + __asm__ __volatile__ + ( + "movddup (%3), %%xmm0 \n\t" // alpha + + "leaq (%1,%4,4), %2 \n\t" + + ".align 16 \n\t" + + "1: \n\t" + "movsd (%1) , %%xmm4 \n\t" + "movhpd (%1,%4,1), %%xmm4 \n\t" + "movsd (%1,%4,2), %%xmm5 \n\t" + "movhpd (%1,%5,1), %%xmm5 \n\t" + + "movsd (%2) , %%xmm6 \n\t" + "movhpd (%2,%4,1), %%xmm6 \n\t" + "movsd (%2,%4,2), %%xmm7 \n\t" + "movhpd (%2,%5,1), %%xmm7 \n\t" + + "mulpd %%xmm0, %%xmm4 \n\t" + "mulpd %%xmm0, %%xmm5 \n\t" + "mulpd %%xmm0, %%xmm6 \n\t" + "mulpd %%xmm0, %%xmm7 \n\t" + + "movsd %%xmm4 , (%1) \n\t" + "movhpd %%xmm4 , (%1,%4,1) \n\t" + "movsd %%xmm5 , (%1,%4,2) \n\t" + "movhpd %%xmm5 , (%1,%5,1) \n\t" + + "movsd %%xmm6 , (%2) \n\t" + "movhpd %%xmm6 , (%2,%4,1) \n\t" + "movsd %%xmm7 , (%2,%4,2) \n\t" + "movhpd %%xmm7 , (%2,%5,1) \n\t" + + "leaq (%1,%4,8), %1 \n\t" + "leaq (%2,%4,8), %2 \n\t" + + "subq $8, %0 \n\t" + "jnz 1b \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (inc_x), // 4 + "r" (inc_x3) // 5 + : "cc", "%0", "%1", "%2", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; 
@@ -91,6 +162,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { + BLASLONG n1 = n & -2; + + while(j < n1) + { + + x[i]=0.0; + x[i+inc_x]=0.0; + i += 2*inc_x ; + j+=2; + + } + while(j < n) { @@ -103,6 +186,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + dscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) { From e00cccc41e2615f785cdd6bad45032a78a411564 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 13:05:35 +0200 Subject: [PATCH 176/257] added optimized dscal kernel for piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 ++ kernel/x86_64/dscal.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index be8b629d9..7c4c7cd43 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d72a24b16..a425cb710 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" From 18e90ee2e390c75eba72a1d9d069da4229b62af5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 13:31:26 +0200 Subject: [PATCH 177/257] bugfix: added static to functions --- kernel/x86_64/dscal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index a425cb710..e3e2b0d58 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(HAVE_KERNEL_8) -void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) { BLASLONG i; @@ -60,7 +60,7 @@ void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) } -void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) { BLASLONG i; @@ -82,9 +82,9 @@ void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) #endif -void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); +static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); -void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { FLOAT *x1; From 51ff17d46e0584a414e4ef97600f348877410adc Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 13 May 2015 16:16:30 -0500 Subject: [PATCH 178/257] Add AMD Excavator target. 
--- Makefile.system | 8 ++- TargetList.txt | 1 + common_x86.h | 2 +- common_x86_64.h | 2 +- cpuid.h | 2 + cpuid_x86.c | 41 +++++++++++---- driver/others/dynamic.c | 28 ++++++++--- getarch.c | 16 ++++++ kernel/x86_64/KERNEL.EXCAVATOR | 92 ++++++++++++++++++++++++++++++++++ param.h | 92 ++++++++++++++++++++++++++++++++++ 10 files changed, 265 insertions(+), 19 deletions(-) create mode 100644 kernel/x86_64/KERNEL.EXCAVATOR diff --git a/Makefile.system b/Makefile.system index 65294b906..78eeb121c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -65,6 +65,9 @@ endif ifeq ($(TARGET), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), EXCAVATOR) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -92,6 +95,9 @@ endif ifeq ($(TARGET_CORE), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET_CORE), EXCAVATOR) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -409,7 +415,7 @@ endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL diff --git a/TargetList.txt b/TargetList.txt index 1c985080b..0a9d8b40c 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -33,6 +33,7 @@ BOBCAT BULLDOZER PILEDRIVER STEAMROLLER +EXCAVATOR c)VIA CPU: SSE_GENERIC diff --git a/common_x86.h b/common_x86.h index 9d82090cc..99a723fd7 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index e0a6c4c42..efb902416 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/cpuid.h b/cpuid.h index ab6a3fb32..8a8cdf6dd 100644 --- a/cpuid.h +++ b/cpuid.h @@ -109,6 +109,7 @@ #define CORE_PILEDRIVER 23 #define CORE_HASWELL 24 #define CORE_STEAMROLLER 25 +#define CORE_EXCAVATOR 26 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -203,5 +204,6 @@ typedef struct { #define CPUTYPE_PILEDRIVER 47 #define CPUTYPE_HASWELL 48 #define CPUTYPE_STEAMROLLER 49 +#define CPUTYPE_EXCAVATOR 50 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index aece9d871..4f97cfb5a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1198,11 +1198,20 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; //OS don't support AVX. case 0: - if(support_avx()) - return CPUTYPE_STEAMROLLER; - else - return CPUTYPE_BARCELONA; //OS don't support AVX. - + switch(exmodel){ + case 3: + if(support_avx()) + return CPUTYPE_STEAMROLLER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. + + case 6: + if(support_avx()) + return CPUTYPE_EXCAVATOR; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. 
+ } + break; } break; case 5: @@ -1332,6 +1341,7 @@ static char *cpuname[] = { "PILEDRIVER", "HASWELL", "STEAMROLLER", + "EXCAVATOR", }; static char *lowercpuname[] = { @@ -1384,6 +1394,7 @@ static char *lowercpuname[] = { "piledriver", "haswell", "steamroller", + "excavator", }; static char *corename[] = { @@ -1413,6 +1424,7 @@ static char *corename[] = { "PILEDRIVER", "HASWELL", "STEAMROLLER", + "EXCAVATOR", }; static char *corename_lower[] = { @@ -1442,6 +1454,7 @@ static char *corename_lower[] = { "piledriver", "haswell", "steamroller", + "excavator", }; @@ -1644,10 +1657,20 @@ int get_coretype(void){ return CORE_BARCELONA; //OS don't support AVX. case 0: - if(support_avx()) - return CORE_STEAMROLLER; - else - return CORE_BARCELONA; //OS don't support AVX. + switch(exmodel){ + case 3: + if(support_avx()) + return CORE_STEAMROLLER; + else + return CORE_BARCELONA; //OS don't support AVX. + + case 6: + if(support_avx()) + return CORE_EXCAVATOR; + else + return CORE_BARCELONA; //OS don't support AVX. + } + break; } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 6945c17d4..ff80504f9 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -67,6 +67,7 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; extern gotoblas_t gotoblas_STEAMROLLER; +extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #else @@ -79,6 +80,7 @@ extern gotoblas_t gotoblas_HASWELL; #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA +#define gotoblas_EXCAVATOR gotoblas_BARCELONA #endif @@ -307,12 +309,22 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
} }else if(model == 0){ - //AMD STEAMROLLER - if(support_avx()) - return &gotoblas_STEAMROLLER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + if (exmodel == 3) { + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 6) { + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } } @@ -357,6 +369,7 @@ static char *corename[] = { "Piledriver", "Haswell", "Steamroller", + "Excavator", }; char *gotoblas_corename(void) { @@ -382,6 +395,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; + if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; return corename[0]; } @@ -412,7 +426,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { - + case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); diff --git a/getarch.c b/getarch.c index ee5f55fd1..d6ecaeb62 100644 --- a/getarch.c +++ b/getarch.c @@ -448,6 +448,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "STEAMROLLER" #endif +#if defined (FORCE_EXCAVATOR) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "EXCAVATOR" +#define ARCHCONFIG "-DEXCAVATOR " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ + "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" +#define LIBNAME "excavator" +#define CORENAME "EXCAVATOR" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE diff --git a/kernel/x86_64/KERNEL.EXCAVATOR b/kernel/x86_64/KERNEL.EXCAVATOR new file mode 100644 index 000000000..dbdd1fe9b --- /dev/null +++ b/kernel/x86_64/KERNEL.EXCAVATOR @@ -0,0 +1,92 @@ +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + + +DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c +SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c + +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c + +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c + +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_4.c + +DCOPYKERNEL = dcopy_bulldozer.S + + +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_2_bulldozer.S +SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S +DGEMMOTCOPY = 
gemm_tcopy_2_bulldozer.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/param.h b/param.h index 18c711eb3..245b678ef 100644 --- a/param.h +++ b/param.h @@ -499,6 +499,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef EXCAVATOR +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define 
XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + #ifdef ATHLON #define SNUMOPT 4 From d63034303b59b53e3bd141ae304ad3889396ef58 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 16 May 2015 16:41:45 +0200 Subject: [PATCH 179/257] added optimized zscal kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/zscal.c | 430 +++++++++++++++++++++++++ kernel/x86_64/zscal_microk_haswell-2.c | 348 ++++++++++++++++++++ 3 files changed, 779 insertions(+) create mode 100644 kernel/x86_64/zscal.c create mode 100644 kernel/x86_64/zscal_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 188c51bf2..37e5b36a2 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,5 @@ DSCALKERNEL = dscal.c +ZSCALKERNEL = zscal.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c new file mode 100644 index 000000000..84b80244d --- /dev/null +++ b/kernel/x86_64/zscal.c @@ -0,0 +1,430 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(HASWELL) +#include "zscal_microk_haswell-2.c" +#endif + + +#if !defined(HAVE_KERNEL_8) + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) +{ + + BLASLONG i; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + FLOAT t0,t1,t2,t3; + + for( i=0; i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1 ; + i = n1 * inc_x; + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += inc_x ; + 
j++; + + } + + } + + } + + return(0); + } + + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + + alpha[0] = da_r; + alpha[1] = da_i; + + if ( da_r == 0.0 ) + if ( da_i == 0 ) + zscal_kernel_8_zero(n1 , alpha , x); + else + zscal_kernel_8_zero_r(n1 , alpha , x); + else + if ( da_i == 0 ) + zscal_kernel_8_zero_i(n1 , alpha , x); + else + zscal_kernel_8(n1 , alpha , x); + + i = n1 << 1; + j = n1; + } + + + if ( da_r == 0.0 ) + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + x[i+1]=0.0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = -da_i * x[i+1]; + x[i+1] = da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + else + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + temp0 = da_r * x[i]; + x[i+1] = da_r * x[i+1]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + + return(0); +} + + diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c new file mode 100644 index 000000000..a93308ec4 --- /dev/null +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 
, %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_r( 
BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, 
-32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void 
zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 7de6bb98892129b36e5b911efeaa53472ba67c80 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 17 May 2015 11:45:19 +0200 Subject: [PATCH 180/257] added optimized zscal kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/zscal.c | 2 + kernel/x86_64/zscal_microk_bulldozer-2.c | 348 +++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 kernel/x86_64/zscal_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index cd1665026..941ac8d4a 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,5 @@ DSCALKERNEL = dscal.c +ZSCALKERNEL = zscal.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 84b80244d..0d875c55b 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "zscal_microk_haswell-2.c" +#elif defined(BULLDOZER) +#include "zscal_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c new file mode 100644 index 000000000..28fe73480 --- /dev/null +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // da_r + "vmovddup 8(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 192(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd 
%%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vmovddup 8(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups -64(%1), %%xmm4 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulpd 
%%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulpd 
%%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%xmm0 , -128(%1) \n\t" + "vmovups %%xmm0 , -112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 31c9e399e91a5559a701d521b243f0ec43fec848 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 17 May 
2015 13:44:09 +0200 Subject: [PATCH 181/257] added optimized cscal kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/cscal.c | 446 +++++++++++++++++++++++++ kernel/x86_64/cscal_microk_haswell-2.c | 348 +++++++++++++++++++ 3 files changed, 795 insertions(+) create mode 100644 kernel/x86_64/cscal.c create mode 100644 kernel/x86_64/cscal_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 37e5b36a2..d1f34cc7b 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,5 @@ DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SGEMVNKERNEL = sgemv_n_4.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c new file mode 100644 index 000000000..3b530f5b4 --- /dev/null +++ b/kernel/x86_64/cscal.c @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(HASWELL) +#include "cscal_microk_haswell-2.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) +{ + + BLASLONG i; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + FLOAT t0,t1,t2,t3; + + for( i=0; i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1 ; + i = n1 * inc_x; + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += inc_x ; + j++; + + } + + } + + } + + return(0); + } + + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + + alpha[0] = da_r; + alpha[1] = da_i; + + if ( da_r == 0.0 ) + if ( da_i == 0 ) + cscal_kernel_16_zero(n1 , alpha , x); + else + cscal_kernel_16_zero_r(n1 , alpha , x); + else + if ( da_i == 0 ) + cscal_kernel_16_zero_i(n1 , alpha , x); + else + cscal_kernel_16(n1 , alpha , x); + + i = n1 << 1; + j = n1; + } + + + if ( da_r == 0.0 ) + { 
+ + if ( da_i == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + x[i+1]=0.0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = -da_i * x[i+1]; + x[i+1] = da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + else + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + temp0 = da_r * x[i]; + x[i+1] = da_r * x[i+1]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + else + { + + BLASLONG n2 = n & -2; + + while(j < n2) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + temp1 = da_r * x[i+2] - da_i * x[i+3]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i+3] = da_r * x[i+3] + da_i * x[i+2]; + x[i] = temp0; + x[i+2] = temp1; + i += 4 ; + j+=2; + + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + + return(0); +} + + diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c new file mode 100644 index 000000000..0424de3a5 --- /dev/null +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + 
"vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + 
"vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ 
__volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 59083e3ce14e0cd6539329bfcdb87484a84c81fc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 18 May 2015 07:33:52 +0200 Subject: [PATCH 182/257] added optimized cscal kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/cscal.c | 2 + kernel/x86_64/cscal_microk_bulldozer-2.c | 348 +++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 kernel/x86_64/cscal_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 941ac8d4a..9f124c97f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,5 @@ DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c DAXPYKERNEL = daxpy.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 3b530f5b4..4785702fd 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "cscal_microk_haswell-2.c" +#elif defined(BULLDOZER) +#include "cscal_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c new file mode 100644 index 000000000..f470cf843 --- /dev/null +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm0 \n\t" // da_r + "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 320(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + 
"vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups -64(%1), %%xmm4 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" 
+ "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" 
+ "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%xmm0 , -128(%1) \n\t" + "vmovups %%xmm0 , -112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 2d9e4060507106617ded476e344fac50488d6edb Mon Sep 17 00:00:00 2001 From: Werner Saar Date: 
Mon, 18 May 2015 08:46:06 +0200 Subject: [PATCH 183/257] added optimized cscal kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 1 + kernel/x86_64/cscal.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index ea81979ac..355d1e2f1 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,4 +1,5 @@ DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c SGERKERNEL = sger.c DGERKERNEL = dger.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 4785702fd..d9e27c55c 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) #include "cscal_microk_bulldozer-2.c" +#elif defined(SANDYBRIDGE) +#include "cscal_microk_bulldozer-2.c" #endif From 95b1faf667ef6b919ee5e1693d8643203806e10b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 18 May 2015 10:50:57 +0200 Subject: [PATCH 184/257] added optimized cscal and zscal kernels for steamroller and piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 3 +++ kernel/x86_64/KERNEL.STEAMROLLER | 4 ++++ kernel/x86_64/cscal.c | 2 +- kernel/x86_64/zscal.c | 2 +- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 7c4c7cd43..5d3c7a2af 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,4 +1,7 @@ DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index dbdd1fe9b..51e6d616a 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -1,3 +1,7 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff 
--git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index d9e27c55c..8c8eba420 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cscal_microk_haswell-2.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "cscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 0d875c55b..91bd44161 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "zscal_microk_haswell-2.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zscal_microk_bulldozer-2.c" #endif From 24f58c8bb15530a3fb00ed43d187e7d2f0cd2fc7 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 18 May 2015 12:40:07 +0200 Subject: [PATCH 185/257] added optimized cscal and zscal kernels for steamroller --- kernel/x86_64/cscal.c | 4 +- kernel/x86_64/cscal_microk_steamroller-2.c | 349 +++++++++++++++++++++ kernel/x86_64/zscal.c | 4 +- kernel/x86_64/zscal_microk_steamroller-2.c | 349 +++++++++++++++++++++ 4 files changed, 704 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/cscal_microk_steamroller-2.c create mode 100644 kernel/x86_64/zscal_microk_steamroller-2.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 8c8eba420..5d86b1929 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "cscal_microk_haswell-2.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "cscal_microk_steamroller-2.c" #elif defined(SANDYBRIDGE) #include "cscal_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c new file mode 100644 index 000000000..763e7add4 --- /dev/null +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "prefetcht0 768(%1) \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + 
"vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" 
+ "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + 
+static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , 
-96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 91bd44161..a96766032 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "zscal_microk_haswell-2.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "zscal_microk_steamroller-2.c" #endif diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c new file mode 100644 index 000000000..d611bf570 --- /dev/null +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 320(%1) \n\t" + "prefetcht0 384(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" 
+ "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static 
void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) 
\n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + 
); + +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From e127fb8fd8d444bafa174e435c2431a56632e077 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 19 May 2015 00:01:04 -0500 Subject: [PATCH 186/257] 1) Refs #575. Remove g77 from compiler list. 2) If OpenBLAS cannot find Fortran compiler, it will only build BLAS (without LAPACK). --- Makefile | 2 +- Makefile.system | 6 ++++++ exports/Makefile | 4 ++-- f_check | 14 +++++++------- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index f8e1345d5..6ad87d802 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,7 @@ ifeq ($(CORE), UNKOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) endif ifeq ($(NOFORTRAN), 1) - $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) + $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) 
endif ifeq ($(NO_STATIC), 1) ifeq ($(NO_SHARED), 1) diff --git a/Makefile.system b/Makefile.system index 78eeb121c..fcaa49a40 100644 --- a/Makefile.system +++ b/Makefile.system @@ -202,6 +202,12 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv + +# For detect fortran failed, only build BLAS. +ifeq ($(NOFORTRAN), 1) +NO_LAPACK = 1 +endif + # # OS dependent settings # diff --git a/exports/Makefile b/exports/Makefile index 1fdaf2213..177e975ea 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,8 +100,8 @@ else $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def endif -ifeq ($(NOFORTRAN), 2) -#only build cblas without Fortran +ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) +#only build without Fortran $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) diff --git a/f_check b/f_check index 7c6cc78ae..b7a48a628 100644 --- a/f_check +++ b/f_check @@ -3,11 +3,11 @@ # # 1. Not specified # 1.1 Automatically detect, then check compiler -# 1.2 If no fortran compiler is detected, g77 is default with NOFORTRAN definition +# 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition # 2. 
Specified # 2.1 If path is correct, check compiler # 2.2 If path is not correct, but still valid compiler name, force setting -# 2.2.2 Path is not correct, invalid compiler name, then g77 is default with NOFORTRAN definition +# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition # $makefile = shift(@ARGV); @@ -25,7 +25,7 @@ $compiler = "" if $compiler eq "f77"; if ($compiler eq "") { - @lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", + @lists = ("g95", "gfortran", "frt", "fort", "openf90", "openf95", "sunf77", "sunf90", "sunf95", "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", @@ -49,8 +49,8 @@ OUTER: if ($compiler eq "") { $nofortran = 1; - $compiler = "g77"; - $vendor = G77; + $compiler = "gfortran"; + $vendor = GFORTRAN; $bu = "_"; } else { @@ -197,8 +197,8 @@ if ($compiler eq "") { if ($vendor eq "") { $nofortran = 1; - $compiler = "g77"; - $vendor = G77; + $compiler = "gfortran"; + $vendor = GFORTRAN; $bu = "_"; $openmp = ""; } From ea4df0aad3f4f78bdfe00826c2f66ca310084050 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 19 May 2015 10:43:12 +0200 Subject: [PATCH 187/257] Ref #574: Bugfix for armv6 memory barrier --- common_arm.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/common_arm.h b/common_arm.h index eb4234b05..4acbc7061 100644 --- a/common_arm.h +++ b/common_arm.h @@ -71,9 +71,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef COMMON_ARM #define COMMON_ARM +#if defined(ARMV5) || defined(ARMV6) + +#define MB +#define WMB + +#else + #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") +#endif + #define INLINE inline #define RETURN_BY_COMPLEX From 6d40fa587fe13f4f0ec6b0c300d976200af15d41 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 19 May 2015 12:04:45 -0500 Subject: [PATCH 188/257] Fix f_check bug. 
--- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index b7a48a628..e7e46886f 100644 --- a/f_check +++ b/f_check @@ -25,7 +25,7 @@ $compiler = "" if $compiler eq "f77"; if ($compiler eq "") { - @lists = ("g95", "gfortran", "frt", "fort", "openf90", "openf95", + @lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95", "sunf77", "sunf90", "sunf95", "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", From 23fbc5728ed284e734ad9b1211e1cd6c052454f9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 20 May 2015 11:05:00 +0200 Subject: [PATCH 189/257] added blas level1 copy benchmark --- benchmark/Makefile | 85 +++++++++++++++++ benchmark/copy.c | 201 +++++++++++++++++++++++++++++++++++++++ benchmark/plot-filter.sh | 5 + 3 files changed, 291 insertions(+) create mode 100644 benchmark/copy.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1418fd522..2f69196e3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -43,6 +43,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ @@ -66,6 +67,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ @@ -89,6 +91,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas 
zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ @@ -113,6 +116,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ @@ -136,6 +140,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ @@ -1346,6 +1351,73 @@ zaxpy.mkl : zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) 
../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sscal #################################################### sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o 
$(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1727,6 +1799,19 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + sscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/copy.c b/benchmark/copy.c new file mode 100644 index 000000000..15c45201c --- /dev/null +++ b/benchmark/copy.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef COPY + +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Wed, 20 May 2015 21:57:27 -0500 Subject: [PATCH 190/257] Support Android NDK armeabi-v7a-hard ABI. (-mfloat-abi=hard) e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7 In Android NDK, it uses armeabi-v7a-hard ABI. TARGET_CFLAGS += -mhard-float -D_NDK_MATH_NO_SOFTFP=1 TARGET_LDFLAGS += -Wl,--no-warn-mismatch -lm_hard For more information, please check hard-float example at android_ndk/tests/device/hard-float/jni/. 
--- Makefile.arm | 10 ++++++++++ c_check | 1 + common.h | 6 ++++++ ctest.c | 4 ++++ driver/others/memory.c | 18 ++++++++++++++---- 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index 9978a672a..2f7b33730 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,13 +1,23 @@ # ifeq logical or ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(OSNAME), Android) +CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +else CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a endif +endif ifeq ($(CORE), ARMV7) +ifeq ($(OSNAME), Android) +CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +else CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a endif +endif ifeq ($(CORE), ARMV6) CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 diff --git a/c_check b/c_check index 99de07067..0fdadb659 100644 --- a/c_check +++ b/c_check @@ -57,6 +57,7 @@ $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); +$os = Android if ($data =~ /OS_ANDROID/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); diff --git a/common.h b/common.h index cf25fd5b0..320adadcb 100644 --- a/common.h +++ b/common.h @@ -93,6 +93,10 @@ extern "C" { #include #endif +#ifdef OS_ANDROID +#define NO_SYSV_IPC +#endif + #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM @@ -106,7 +110,9 @@ extern "C" { #endif #else #include +#ifndef NO_SYSV_IPC #include +#endif #include #include #include diff --git a/ctest.c b/ctest.c index d5c224726..b5c74f137 100644 --- a/ctest.c +++ b/ctest.c @@ -44,6 +44,10 @@ 
COMPILER_DEC COMPILER_GNU #endif +#if defined(__ANDROID__) +OS_ANDROID +#endif + #if defined(__linux__) OS_LINUX #endif diff --git a/driver/others/memory.c b/driver/others/memory.c index a9a80b8b4..a562da377 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef OS_WINDOWS #include +#ifndef NO_SYSV_IPC #include +#endif #include #endif @@ -169,6 +171,14 @@ int get_num_procs(void) { #endif #endif +#ifdef OS_ANDROID +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -266,7 +276,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. -#if !defined(OS_WINDOWS) && defined(SMP_SERVER) +#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -276,7 +286,7 @@ void openblas_fork_handler() int blas_get_cpu_number(void){ env_var_t p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -284,7 +294,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -308,7 +318,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || 
defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From c47c8e8cf5422bf34c69ba3809053a4432b5d0b9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 21 May 2015 08:51:42 +0200 Subject: [PATCH 191/257] added blas level1 swap benchmark --- benchmark/Makefile | 85 +++++++++++++++++++ benchmark/swap.c | 201 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 286 insertions(+) create mode 100644 benchmark/swap.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 2f69196e3..ac80e6822 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -44,6 +44,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ @@ -68,6 +69,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sdot.acml ddot.acml cdot.acml zdot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ @@ -92,6 +94,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ @@ -117,6 +120,7 @@ 
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ @@ -141,6 +145,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ @@ -1550,6 +1555,73 @@ zasum.mkl : zasum.$(SUFFIX) zasum.veclib : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ##################################### Cgemm3m #################################################### cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) @@ -1811,6 +1883,19 @@ ccopy.$(SUFFIX) : copy.c zcopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) 
$(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + sscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/swap.c b/benchmark/swap.c new file mode 100644 index 000000000..9f108ef50 --- /dev/null +++ b/benchmark/swap.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above swapright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above swapright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE SWAPRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SWAP + +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Sat, 23 May 2015 10:58:38 +0200 Subject: [PATCH 192/257] smp lock bugfix --- common_arm64.h | 55 ++++++++++---------------------------------------- 1 file changed, 11 insertions(+), 44 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index ae79c5309..ee13566f8 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,49 +30,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - #ifndef COMMON_ARM64 #define COMMON_ARM64 -#define MB -#define WMB +#define MB __asm__ __volatile__ ("dmb ish" : : : "memory") +#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") + #define INLINE inline @@ -81,17 +44,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER static void __inline blas_lock(volatile BLASULONG *address){ -/* + int register ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( + "1: \n\t" "ldrex r2, [%1] \n\t" "mov r2, #0 \n\t" "strex r3, r2, [%1] \n\t" - "mov %0 , r3 \n\t" + "cmp r3, #0 \n\t" + "bne 1b \n\t" + "mov %0 , r3 \n\t" : "=r"(ret), "=r"(address) : "1"(address) : "memory", "r2" , "r3" @@ -100,7 +66,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ ); } while (ret); -*/ + } @@ -166,3 +132,4 @@ REALNAME: #endif #endif + From eea2e30b744e5c905cb9107675a86feecbf63977 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 23 May 2015 11:40:40 +0200 Subject: [PATCH 193/257] bugfix for arm locking --- common_arm.h | 40 +--------------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/common_arm.h b/common_arm.h index 4acbc7061..135191057 100644 --- a/common_arm.h +++ b/common_arm.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,44 +30,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - #ifndef COMMON_ARM #define COMMON_ARM From 02c7766f687c586359f3e698ef4aede735cf626c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 29 May 2015 12:56:22 +0200 Subject: [PATCH 194/257] bugfixes, to build benchmarks with mingw on Windows OS --- benchmark/Make_exe.sh | 9 ++++ benchmark/Makefile | 117 +++++++++++++++++++++++++++++++++++++++++- benchmark/cholesky.c | 7 +++ benchmark/potrf.c | 4 ++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100755 benchmark/Make_exe.sh diff --git a/benchmark/Make_exe.sh b/benchmark/Make_exe.sh new file mode 100755 index 000000000..4304f6fb3 --- /dev/null +++ b/benchmark/Make_exe.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +for f in *.goto *.acml *.mkl *.atlas +do + if [ -f "$f" ]; then + mv $f `echo $f|tr '.' 
'_'`.exe + fi +done + diff --git a/benchmark/Makefile b/benchmark/Makefile index ac80e6822..7af9b860e 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -33,6 +33,111 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread # Apple vecLib LIBVECLIB = -framework Accelerate +ifeq ($(OSNAME), WINNT) + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml 
zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl 
zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + +else + goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ @@ -66,7 +171,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml cdot.acml zdot.acml \ + sdot.acml ddot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -134,6 +239,13 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + + + +endif + + + veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ @@ -165,6 +277,9 @@ mkl_3m :: cgemm3m.mkl zgemm3m.mkl all :: goto mkl atlas acml veclib +exe : + @./Make_exe.sh + ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) 
$^ $(CEXTRALIB) $(EXTRALIB) -lm diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index c8b96d80f..8d121efb3 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -71,8 +71,14 @@ double fabs(double); #endif #endif + + #if defined(__WIN32__) || defined(__WIN64__) +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; @@ -99,6 +105,7 @@ int gettimeofday(struct timeval *tv, void *tz){ #endif + static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 3caf61caa..1d714549b 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -88,6 +88,10 @@ double fabs(double); #if defined(__WIN32__) || defined(__WIN64__) +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; From 9c4817d07b116486b8824bf2bcf77343e5014dd7 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 31 May 2015 14:16:51 +0200 Subject: [PATCH 195/257] bugfix for Makefile on mac --- benchmark/Makefile | 184 ++++++++++++++++++++++----------------------- 1 file changed, 91 insertions(+), 93 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 7af9b860e..01847f839 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -282,7 +282,7 @@ exe : ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm slinpack.acml : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -298,7 +298,7 @@ slinpack.veclib : slinpack.$(SUFFIX) ##################################### Dlinpack #################################################### 
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dlinpack.acml : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -315,7 +315,7 @@ dlinpack.veclib : dlinpack.$(SUFFIX) ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm clinpack.acml : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -332,7 +332,7 @@ clinpack.veclib : clinpack.$(SUFFIX) ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zlinpack.acml : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -349,7 +349,7 @@ zlinpack.veclib : zlinpack.$(SUFFIX) ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scholesky.acml : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -366,7 +366,7 @@ scholesky.veclib : scholesky.$(SUFFIX) ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm 
dcholesky.acml : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -383,7 +383,7 @@ dcholesky.veclib : dcholesky.$(SUFFIX) ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ccholesky.acml : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -397,13 +397,11 @@ ccholesky.mkl : ccholesky.$(SUFFIX) ccholesky.veclib : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm ##################################### Zcholesky ################################################### -xcholesky.goto : xcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zcholesky.acml : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -419,7 +417,7 @@ zcholesky.veclib : zcholesky.$(SUFFIX) ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemm.acml : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -435,7 +433,7 @@ sgemm.veclib : sgemm.$(SUFFIX) ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemm.acml : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -452,7 +450,7 @@ dgemm.veclib : dgemm.$(SUFFIX) ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm.acml : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -469,7 +467,7 @@ cgemm.veclib : cgemm.$(SUFFIX) ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm.acml : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -485,7 +483,7 @@ zgemm.veclib : zgemm.$(SUFFIX) ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssymm.acml : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -501,7 +499,7 @@ ssymm.veclib : ssymm.$(SUFFIX) ##################################### Dsymm #################################################### dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymm.acml : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -518,7 +516,7 @@ dsymm.veclib : dsymm.$(SUFFIX) ##################################### Csymm 
#################################################### csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymm.acml : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -535,7 +533,7 @@ csymm.veclib : csymm.$(SUFFIX) ##################################### Zsymm #################################################### zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymm.acml : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -551,7 +549,7 @@ zsymm.veclib : zsymm.$(SUFFIX) ##################################### Strmm #################################################### strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strmm.acml : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -567,7 +565,7 @@ strmm.veclib : strmm.$(SUFFIX) ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrmm.acml : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -584,7 +582,7 @@ dtrmm.veclib : dtrmm.$(SUFFIX) ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrmm.acml : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) 
$^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -601,7 +599,7 @@ ctrmm.veclib : ctrmm.$(SUFFIX) ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrmm.acml : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -617,7 +615,7 @@ ztrmm.veclib : ztrmm.$(SUFFIX) ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strsm.acml : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -633,7 +631,7 @@ strsm.veclib : strsm.$(SUFFIX) ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrsm.acml : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -650,7 +648,7 @@ dtrsm.veclib : dtrsm.$(SUFFIX) ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrsm.acml : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -667,7 +665,7 @@ ctrsm.veclib : ctrsm.$(SUFFIX) ##################################### Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrsm.acml : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -683,7 +681,7 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssyrk.acml : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -699,7 +697,7 @@ ssyrk.veclib : ssyrk.$(SUFFIX) ##################################### Dsyrk #################################################### dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyrk.acml : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -716,7 +714,7 @@ dsyrk.veclib : dsyrk.$(SUFFIX) ##################################### Csyrk #################################################### csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyrk.acml : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -733,7 +731,7 @@ csyrk.veclib : csyrk.$(SUFFIX) ##################################### Zsyrk #################################################### zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyrk.acml : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -749,7 +747,7 @@ zsyrk.veclib : zsyrk.$(SUFFIX) 
##################################### Ssyr2k #################################################### ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssyr2k.acml : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -765,7 +763,7 @@ ssyr2k.veclib : ssyr2k.$(SUFFIX) ##################################### Dsyr2k #################################################### dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyr2k.acml : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -782,7 +780,7 @@ dsyr2k.veclib : dsyr2k.$(SUFFIX) ##################################### Csyr2k #################################################### csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyr2k.acml : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -799,7 +797,7 @@ csyr2k.veclib : csyr2k.$(SUFFIX) ##################################### Zsyr2k #################################################### zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyr2k.acml : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -816,7 +814,7 @@ zsyr2k.veclib : zsyr2k.$(SUFFIX) ##################################### Chemm #################################################### chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm chemm.acml : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -833,7 +831,7 @@ chemm.veclib : chemm.$(SUFFIX) ##################################### Zhemm #################################################### zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemm.acml : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -850,7 +848,7 @@ zhemm.veclib : zhemm.$(SUFFIX) ##################################### Cherk #################################################### cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cherk.acml : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -867,7 +865,7 @@ cherk.veclib : cherk.$(SUFFIX) ##################################### Zherk #################################################### zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zherk.acml : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -884,7 +882,7 @@ zherk.veclib : zherk.$(SUFFIX) ##################################### Cher2k #################################################### cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cher2k.acml : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -901,7 +899,7 @@ cher2k.veclib : cher2k.$(SUFFIX) ##################################### Zher2k #################################################### 
zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zher2k.acml : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -917,7 +915,7 @@ zher2k.veclib : zher2k.$(SUFFIX) ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemv.acml : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -933,7 +931,7 @@ sgemv.veclib : sgemv.$(SUFFIX) ##################################### Dgemv #################################################### dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemv.acml : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -950,7 +948,7 @@ dgemv.veclib : dgemv.$(SUFFIX) ##################################### Cgemv #################################################### cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemv.acml : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -967,7 +965,7 @@ cgemv.veclib : cgemv.$(SUFFIX) ##################################### Zgemv #################################################### zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemv.acml : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) @@ -983,7 +981,7 @@ zgemv.veclib : zgemv.$(SUFFIX) ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sger.acml : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -999,7 +997,7 @@ sger.veclib : sger.$(SUFFIX) ##################################### Dger #################################################### dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dger.acml : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1015,7 +1013,7 @@ dger.veclib : dger.$(SUFFIX) ##################################### Cger #################################################### cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cger.acml : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1031,7 +1029,7 @@ cger.veclib : cger.$(SUFFIX) ##################################### Zger #################################################### zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zger.acml : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1047,7 +1045,7 @@ zger.veclib : zger.$(SUFFIX) ##################################### Ssymv #################################################### ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm ssymv.acml : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1063,7 +1061,7 @@ ssymv.veclib : ssymv.$(SUFFIX) ##################################### Dsymv #################################################### dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymv.acml : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1079,7 +1077,7 @@ dsymv.veclib : dsymv.$(SUFFIX) ##################################### Csymv #################################################### csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymv.acml : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1095,7 +1093,7 @@ csymv.veclib : csymv.$(SUFFIX) ##################################### Dsymv #################################################### zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymv.acml : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1111,7 +1109,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1127,7 +1125,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev 
#################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1144,7 +1142,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1161,7 +1159,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1177,7 +1175,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1193,7 +1191,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1210,7 +1208,7 @@ dgetri.veclib : dgetri.$(SUFFIX) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1227,7 +1225,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1243,7 +1241,7 @@ zgetri.veclib : zgetri.$(SUFFIX) ##################################### Spotrf #################################################### spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm spotrf.acml : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1259,7 +1257,7 @@ spotrf.veclib : spotrf.$(SUFFIX) ##################################### Dpotrf #################################################### dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dpotrf.acml : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1276,7 +1274,7 @@ dpotrf.veclib : dpotrf.$(SUFFIX) ##################################### Cpotrf #################################################### cpotrf.goto : 
cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cpotrf.acml : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1293,7 +1291,7 @@ cpotrf.veclib : cpotrf.$(SUFFIX) ##################################### Zpotrf #################################################### zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zpotrf.acml : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1310,7 +1308,7 @@ zpotrf.veclib : zpotrf.$(SUFFIX) ##################################### Chemv #################################################### chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm chemv.acml : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1327,7 +1325,7 @@ chemv.veclib : chemv.$(SUFFIX) ##################################### Zhemv #################################################### zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemv.acml : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1343,7 +1341,7 @@ zhemv.veclib : zhemv.$(SUFFIX) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sdot.acml : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
@@ -1359,7 +1357,7 @@ sdot.veclib : sdot.$(SUFFIX) ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ddot.acml : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1375,7 +1373,7 @@ ddot.veclib : ddot.$(SUFFIX) ##################################### Cdot #################################################### cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cdot.acml : cdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1391,7 +1389,7 @@ cdot.veclib : cdot-intel.$(SUFFIX) ##################################### Zdot #################################################### zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zdot.acml : zdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1407,7 +1405,7 @@ zdot.veclib : zdot-intel.$(SUFFIX) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm saxpy.acml : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1423,7 +1421,7 @@ saxpy.veclib : saxpy.$(SUFFIX) ##################################### Daxpy #################################################### daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm daxpy.acml : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1440,7 +1438,7 @@ daxpy.veclib : daxpy.$(SUFFIX) ##################################### Caxpy #################################################### caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm caxpy.acml : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1457,7 +1455,7 @@ caxpy.veclib : caxpy.$(SUFFIX) ##################################### Zaxpy #################################################### zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zaxpy.acml : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1474,7 +1472,7 @@ zaxpy.veclib : zaxpy.$(SUFFIX) ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scopy.acml : scopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1490,7 +1488,7 @@ scopy.veclib : scopy.$(SUFFIX) ##################################### Dcopy #################################################### dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dcopy.acml : dcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1507,7 +1505,7 @@ dcopy.veclib : dcopy.$(SUFFIX) ##################################### Ccopy 
#################################################### ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ccopy.acml : ccopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1524,7 +1522,7 @@ ccopy.veclib : ccopy.$(SUFFIX) ##################################### Zcopy #################################################### zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zcopy.acml : zcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1540,7 +1538,7 @@ zcopy.veclib : zcopy.$(SUFFIX) ##################################### Sscal #################################################### sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sscal.acml : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1556,7 +1554,7 @@ sscal.veclib : sscal.$(SUFFIX) ##################################### Dscal #################################################### dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dscal.acml : dscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1573,7 +1571,7 @@ dscal.veclib : dscal.$(SUFFIX) ##################################### Cscal #################################################### cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cscal.acml : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1590,7 +1588,7 @@ cscal.veclib : cscal.$(SUFFIX) ##################################### Zscal #################################################### zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zscal.acml : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1606,7 +1604,7 @@ zscal.veclib : zscal.$(SUFFIX) ##################################### Sasum #################################################### sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sasum.acml : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1622,7 +1620,7 @@ sasum.veclib : sasum.$(SUFFIX) ##################################### Dasum #################################################### dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dasum.acml : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1639,7 +1637,7 @@ dasum.veclib : dasum.$(SUFFIX) ##################################### Casum #################################################### casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm casum.acml : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1656,7 +1654,7 @@ casum.veclib : casum.$(SUFFIX) ##################################### Zasum #################################################### zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o 
$(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zasum.acml : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1672,7 +1670,7 @@ zasum.veclib : zasum.$(SUFFIX) ##################################### Sswap #################################################### sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sswap.acml : sswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1688,7 +1686,7 @@ sswap.veclib : sswap.$(SUFFIX) ##################################### Dswap #################################################### dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dswap.acml : dswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1705,7 +1703,7 @@ dswap.veclib : dswap.$(SUFFIX) ##################################### Cswap #################################################### cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cswap.acml : cswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1722,7 +1720,7 @@ cswap.veclib : cswap.$(SUFFIX) ##################################### Zswap #################################################### zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zswap.acml : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1740,7 +1738,7 @@ zswap.veclib : zswap.$(SUFFIX) 
##################################### Cgemm3m #################################################### cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm3m.mkl : cgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1751,7 +1749,7 @@ cgemm3m.veclib : cgemm3m.$(SUFFIX) ##################################### Zgemm3m #################################################### zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm3m.mkl : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) From 6a13a94e711f44886ea18185bc5a8c5503e7ede6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 2 Jun 2015 13:35:49 +0200 Subject: [PATCH 196/257] added gesv benchmark --- benchmark/Makefile | 89 ++++++++++++++++++ benchmark/gesv.c | 218 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+) create mode 100644 benchmark/gesv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 01847f839..492d2617f 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -56,6 +56,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto @@ -81,6 +82,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml 
cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml @@ -107,6 +109,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas @@ -132,6 +135,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl @@ -158,6 +162,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ @@ -184,6 +189,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml @@ -210,6 +216,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas 
zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas @@ -235,6 +242,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl @@ -267,6 +275,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib @@ -1735,6 +1744,73 @@ zswap.veclib : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ##################################### Cgemm3m #################################################### cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) @@ -2035,6 +2111,19 @@ zasum.$(SUFFIX) : asum.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ 
+sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + cgemm3m.$(SUFFIX) : gemm3m.c diff --git a/benchmark/gesv.c b/benchmark/gesv.c new file mode 100644 index 000000000..26ff8bc1a --- /dev/null +++ b/benchmark/gesv.c @@ -0,0 +1,218 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef GESV +#undef GETRS + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GESV BLASFUNC(qgesv) +#elif defined(DOUBLE) +#define GESV BLASFUNC(dgesv) +#else +#define GESV BLASFUNC(sgesv) +#endif +#else +#ifdef XDOUBLE +#define GESV BLASFUNC(xgesv) +#elif defined(DOUBLE) +#define GESV BLASFUNC(zgesv) +#else +#define GESV BLASFUNC(cgesv) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 
1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *b; + blasint *ipiv; + + blasint m, i, j, info; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + b[i + j * m * COMPSIZE] = 0.0; + } + } + + + for (j = 0; j < m; ++j) { + for 
(i = 0; i < m * COMPSIZE; ++i) { + b[i] += a[i + j * m * COMPSIZE]; + } + } + + gettimeofday( &start, (struct timezone *)0); + + GESV (&m, &m, a, &m, ipiv, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + + + fprintf(stderr, + "%10.2f MFlops %10.6f s\n", + COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); + + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From e19948baa1a65aab64993a00a8f592c5c0e1649c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 3 Jun 2015 09:11:51 +0200 Subject: [PATCH 197/257] small modification of gemm.c --- benchmark/gemm.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 5a3587622..9348018dc 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; char trans='N'; blasint m, n, i, j; int loops = 1; @@ -168,12 +168,21 @@ int main(int argc, char *argv[]){ has_param_n=1; } - #ifdef linux srandom(getpid()); #endif + + for(j = 0; j < m; j++){ + for(i = 0; i < to * COMPSIZE; i++){ + a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + - fprintf(stderr, " SIZE Flops\n"); + fprintf(stderr, " SIZE Flops Time\n"); for(m = from; m <= to; m += step) { @@ -188,34 +197,23 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m, (int)n); + gettimeofday( &start, (struct timezone *)0); for (l=0; l Date: Mon, 8 Jun 2015 12:58:38 +0200 Subject: [PATCH 
198/257] updated geev benchmark --- benchmark/geev.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmark/geev.c b/benchmark/geev.c index a2ca2c315..d3751defb 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -144,6 +144,7 @@ int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; FLOAT wkopt[4]; char job='V'; + char jobr='N'; char *p; blasint m, i, j, info,lwork; @@ -202,9 +203,9 @@ int main(int argc, char *argv[]){ lwork = -1; m=to; #ifndef COMPLEX - GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else - GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif lwork = (blasint)wkopt[0]; @@ -226,16 +227,16 @@ int main(int argc, char *argv[]){ lwork = -1; #ifndef COMPLEX - GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else - GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif lwork = (blasint)wkopt[0]; #ifndef COMPLEX - GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); #else - GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); + GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif gettimeofday( &stop, (struct timezone *)0); From 8614057ea9d10a14d7986935150fc4fa78846cc6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 8 Jun 2015 14:06:38 +0200 Subject: [PATCH 199/257] added benchmark scripts for numpy, octave and R --- benchmark/scripts/NUMPY/cgemm.py | 56 +++++++++++++++++++++++++++ 
benchmark/scripts/NUMPY/cgemv.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/daxpy.py | 58 ++++++++++++++++++++++++++++ benchmark/scripts/NUMPY/ddot.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/deig.py | 55 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/dgemm.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/dgemv.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/dgesv.py | 58 ++++++++++++++++++++++++++++ benchmark/scripts/NUMPY/dsolve.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/sdot.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/sgemm.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/sgemv.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/zgemm.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/NUMPY/zgemv.py | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/cgemm.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/cgemv.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/deig.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/dgemm.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/dgemv.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/dsolve.m | 59 +++++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/sgemm.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/sgemv.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/zgemm.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/OCTAVE/zgemv.m | 56 +++++++++++++++++++++++++++ benchmark/scripts/R/deig.R | 62 ++++++++++++++++++++++++++++++ benchmark/scripts/R/dgemm.R | 63 +++++++++++++++++++++++++++++++ benchmark/scripts/R/dsolve.R | 63 +++++++++++++++++++++++++++++++ 27 files changed, 1538 insertions(+) create mode 100755 benchmark/scripts/NUMPY/cgemm.py create mode 100755 benchmark/scripts/NUMPY/cgemv.py create mode 100755 benchmark/scripts/NUMPY/daxpy.py create mode 100755 benchmark/scripts/NUMPY/ddot.py create mode 100755 
benchmark/scripts/NUMPY/deig.py create mode 100755 benchmark/scripts/NUMPY/dgemm.py create mode 100755 benchmark/scripts/NUMPY/dgemv.py create mode 100755 benchmark/scripts/NUMPY/dgesv.py create mode 100755 benchmark/scripts/NUMPY/dsolve.py create mode 100755 benchmark/scripts/NUMPY/sdot.py create mode 100755 benchmark/scripts/NUMPY/sgemm.py create mode 100755 benchmark/scripts/NUMPY/sgemv.py create mode 100755 benchmark/scripts/NUMPY/zgemm.py create mode 100755 benchmark/scripts/NUMPY/zgemv.py create mode 100755 benchmark/scripts/OCTAVE/cgemm.m create mode 100755 benchmark/scripts/OCTAVE/cgemv.m create mode 100755 benchmark/scripts/OCTAVE/deig.m create mode 100755 benchmark/scripts/OCTAVE/dgemm.m create mode 100755 benchmark/scripts/OCTAVE/dgemv.m create mode 100755 benchmark/scripts/OCTAVE/dsolve.m create mode 100755 benchmark/scripts/OCTAVE/sgemm.m create mode 100755 benchmark/scripts/OCTAVE/sgemv.m create mode 100755 benchmark/scripts/OCTAVE/zgemm.m create mode 100755 benchmark/scripts/OCTAVE/zgemv.m create mode 100755 benchmark/scripts/R/deig.R create mode 100755 benchmark/scripts/R/dgemm.R create mode 100755 benchmark/scripts/R/dsolve.R diff --git a/benchmark/scripts/NUMPY/cgemm.py b/benchmark/scripts/NUMPY/cgemm.py new file mode 100755 index 000000000..b35d3b896 --- /dev/null +++ b/benchmark/scripts/NUMPY/cgemm.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_cgemm(N,l): + + A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j; + B = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j; + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 8*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in 
sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_cgemm(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/cgemv.py b/benchmark/scripts/NUMPY/cgemv.py new file mode 100755 index 000000000..aa0ac9d64 --- /dev/null +++ b/benchmark/scripts/NUMPY/cgemv.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_cgemv(N,l): + + A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j; + B = randn(N).astype('float32') + randn(N).astype('float32') * 1j; + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 8*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_cgemv(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/daxpy.py b/benchmark/scripts/NUMPY/daxpy.py new file mode 100755 index 000000000..db2e0e607 --- /dev/null +++ b/benchmark/scripts/NUMPY/daxpy.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn +from 
scipy.linalg.blas import daxpy + + +def run_daxpy(N,l): + + x = randn(N).astype('float64') + y = randn(N).astype('float64') + + start = time.time(); + for i in range(0,l): + y = daxpy(x,y, a=2.0 ) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N ) *l / timediff + mflops *= 1e-6 + + size = "%d" % (N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_daxpy(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/ddot.py b/benchmark/scripts/NUMPY/ddot.py new file mode 100755 index 000000000..0f4ced339 --- /dev/null +++ b/benchmark/scripts/NUMPY/ddot.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_ddot(N,l): + + A = randn(N).astype('float64') + B = randn(N).astype('float64') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N ) *l / timediff + mflops *= 1e-6 + + size = "%d" % (N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + 
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_ddot(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/deig.py b/benchmark/scripts/NUMPY/deig.py new file mode 100755 index 000000000..aac7abe15 --- /dev/null +++ b/benchmark/scripts/NUMPY/deig.py @@ -0,0 +1,55 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_deig(N,l): + + A = randn(N,N).astype('float64') + + start = time.time(); + for i in range(0,l): + la,v = numpy.linalg.eig(A) + end = time.time() + + timediff = (end -start) + mflops = ( 26.33 *N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_deig(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/dgemm.py b/benchmark/scripts/NUMPY/dgemm.py new file mode 100755 index 000000000..a31248786 --- /dev/null +++ b/benchmark/scripts/NUMPY/dgemm.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_dgemm(N,l): + + A = randn(N,N).astype('float64') + B = randn(N,N).astype('float64') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 
+ for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_dgemm(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/dgemv.py b/benchmark/scripts/NUMPY/dgemv.py new file mode 100755 index 000000000..bbc295e8e --- /dev/null +++ b/benchmark/scripts/NUMPY/dgemv.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_dgemv(N,l): + + A = randn(N,N).astype('float64') + B = randn(N).astype('float64') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_dgemv(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/dgesv.py b/benchmark/scripts/NUMPY/dgesv.py new file mode 100755 index 000000000..8adabd146 --- /dev/null +++ b/benchmark/scripts/NUMPY/dgesv.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn +from scipy.linalg.lapack import dgesv + +def run_dgesv(N,l): + + a = 
randn(N,N).astype('float64') + b = randn(N,N).astype('float64') + + start = time.time(); + for i in range(0,l): + dgesv(a,b,1,1) + end = time.time() + + timediff = (end -start) + + mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_dgesv(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/dsolve.py b/benchmark/scripts/NUMPY/dsolve.py new file mode 100755 index 000000000..1b067a84f --- /dev/null +++ b/benchmark/scripts/NUMPY/dsolve.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_dsolve(N,l): + + A = randn(N,N).astype('float64') + B = randn(N,N).astype('float64') + + start = time.time(); + for i in range(0,l): + ref = numpy.linalg.solve(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + 
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_dsolve(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/sdot.py b/benchmark/scripts/NUMPY/sdot.py new file mode 100755 index 000000000..4fe6b8cc9 --- /dev/null +++ b/benchmark/scripts/NUMPY/sdot.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_sdot(N,l): + + A = randn(N).astype('float32') + B = randn(N).astype('float32') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N ) *l / timediff + mflops *= 1e-6 + + size = "%d" % (N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_sdot(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/sgemm.py b/benchmark/scripts/NUMPY/sgemm.py new file mode 100755 index 000000000..1680ec28d --- /dev/null +++ b/benchmark/scripts/NUMPY/sgemm.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_sgemm(N,l): + + A = randn(N,N).astype('float32') + B = randn(N,N).astype('float32') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + 
LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_sgemm(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/sgemv.py b/benchmark/scripts/NUMPY/sgemv.py new file mode 100755 index 000000000..3fe6add55 --- /dev/null +++ b/benchmark/scripts/NUMPY/sgemv.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_sgemv(N,l): + + A = randn(N,N).astype('float32') + B = randn(N).astype('float32') + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 2*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_sgemv(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/zgemm.py b/benchmark/scripts/NUMPY/zgemm.py new file mode 100755 index 000000000..4556d4fd2 --- /dev/null +++ b/benchmark/scripts/NUMPY/zgemm.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_zgemm(N,l): + + A = randn(N,N).astype('float64') 
+ randn(N,N).astype('float64') * 1j; + B = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j; + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 8*N*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_zgemm(i,LOOPS) + diff --git a/benchmark/scripts/NUMPY/zgemv.py b/benchmark/scripts/NUMPY/zgemv.py new file mode 100755 index 000000000..ea69a19bb --- /dev/null +++ b/benchmark/scripts/NUMPY/zgemv.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import os +import sys +import time +import numpy +from numpy.random import randn + +def run_zgemv(N,l): + + A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j; + B = randn(N).astype('float64') + randn(N).astype('float64') * 1j; + + start = time.time(); + for i in range(0,l): + ref = numpy.dot(A,B) + end = time.time() + + timediff = (end -start) + mflops = ( 8*N*N) *l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N,N) + print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff)) + + +if __name__ == "__main__": + N=128 + NMAX=2048 + NINC=128 + LOOPS=1 + + z=0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p); + + print("From: %d To: %d 
Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range (N,NMAX+NINC,NINC): + run_zgemv(i,LOOPS) + diff --git a/benchmark/scripts/OCTAVE/cgemm.m b/benchmark/scripts/OCTAVE/cgemm.m new file mode 100755 index 000000000..0e79e71ad --- /dev/null +++ b/benchmark/scripts/OCTAVE/cgemm.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = single(rand(n,n)) + single(rand(n,n)) * 1i; + B = single(rand(n,n)) + single(rand(n,n)) * 1i; + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/cgemv.m b/benchmark/scripts/OCTAVE/cgemv.m new file mode 100755 index 000000000..7237983b6 --- /dev/null +++ b/benchmark/scripts/OCTAVE/cgemv.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; 
+while n <= nto + + A = single(rand(n,n)) + single(rand(n,n)) * 1i; + B = single(rand(n,1)) + single(rand(n,1)) * 1i; + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/deig.m b/benchmark/scripts/OCTAVE/deig.m new file mode 100755 index 000000000..15c85af39 --- /dev/null +++ b/benchmark/scripts/OCTAVE/deig.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)); + start = clock(); + + l=0; + while l < loops + + [V,lambda] = eig(A); + l = l + 1; + + endwhile + + + timeg = etime(clock(), start); + mflops = ( 26.33 *n*n*n ) *loops / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg ); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/dgemm.m b/benchmark/scripts/OCTAVE/dgemm.m new file mode 100755 index 000000000..da4f127ec --- /dev/null +++ b/benchmark/scripts/OCTAVE/dgemm.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); 
+ + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)); + B = double(rand(n,n)); + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/dgemv.m b/benchmark/scripts/OCTAVE/dgemv.m new file mode 100755 index 000000000..139b14159 --- /dev/null +++ b/benchmark/scripts/OCTAVE/dgemv.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)); + B = double(rand(n,1)); + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/dsolve.m b/benchmark/scripts/OCTAVE/dsolve.m new file mode 100755 index 000000000..fff4b0847 --- /dev/null +++ b/benchmark/scripts/OCTAVE/dsolve.m @@ -0,0 +1,59 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + 
switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)); + B = double(rand(n,n)); + start = clock(); + + l=0; + while l < loops + + x = linsolve(A,B); + #x = A / B; + l = l + 1; + + endwhile + + + timeg = etime(clock(), start); + #r = norm(A*x - B)/norm(B) + mflops = ( 2.0/3.0 *n*n*n + 2.0*n*n*n ) *loops / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg ); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/sgemm.m b/benchmark/scripts/OCTAVE/sgemm.m new file mode 100755 index 000000000..b79548b72 --- /dev/null +++ b/benchmark/scripts/OCTAVE/sgemm.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = single(rand(n,n)); + B = single(rand(n,n)); + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/sgemv.m b/benchmark/scripts/OCTAVE/sgemv.m new file 
mode 100755 index 000000000..13152886e --- /dev/null +++ b/benchmark/scripts/OCTAVE/sgemv.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = single(rand(n,n)); + B = single(rand(n,1)); + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/zgemm.m b/benchmark/scripts/OCTAVE/zgemm.m new file mode 100755 index 000000000..a748437eb --- /dev/null +++ b/benchmark/scripts/OCTAVE/zgemm.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)) + double(rand(n,n)) * 1i; + B = double(rand(n,n)) + double(rand(n,n)) * 1i; + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 ); 
+ + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/OCTAVE/zgemv.m b/benchmark/scripts/OCTAVE/zgemv.m new file mode 100755 index 000000000..10a0089ec --- /dev/null +++ b/benchmark/scripts/OCTAVE/zgemv.m @@ -0,0 +1,56 @@ +#!/usr/bin/octave --silent + +nfrom = 128 ; +nto = 2048; +nstep = 128; +loops = 1; + + +arg_list = argv(); +for i = 1:nargin + + switch(i) + case 1 + nfrom = str2num(arg_list{i}); + case 2 + nto = str2num(arg_list{i}); + case 3 + nstep = str2num(arg_list{i}); + case 4 + loops = str2num(arg_list{i}); + + endswitch + +endfor + +p = getenv("OPENBLAS_LOOPS"); +if p + loops = str2num(p); +endif + +printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops); +printf(" SIZE FLOPS TIME\n"); + +n = nfrom; +while n <= nto + + A = double(rand(n,n)) + double(rand(n,n)) * 1i; + B = double(rand(n,1)) + double(rand(n,1)) * 1i; + start = clock(); + + l=0; + while l < loops + + C = A * B; + l = l + 1; + + endwhile + + timeg = etime(clock(), start); + mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 ); + + st1 = sprintf("%dx%d : ", n,n); + printf("%20s %10.2f MFlops %10.6f sec\n", st1, mflops, timeg); + n = n + nstep; + +endwhile diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R new file mode 100755 index 000000000..3521c7c5a --- /dev/null +++ b/benchmark/scripts/R/deig.R @@ -0,0 +1,62 @@ +#!/usr/bin/Rscript + +argv <- commandArgs(trailingOnly = TRUE) + +nfrom = 128 +nto = 2048 +nstep = 128 +loops = 1 + +if ( length(argv) > 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, 
nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + ev <- eigen(A) + l = l + 1 + } + + end <- proc.time()[3] + timeg = end - start + mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R new file mode 100755 index 000000000..f1c09c38d --- /dev/null +++ b/benchmark/scripts/R/dgemm.R @@ -0,0 +1,63 @@ +#!/usr/bin/Rscript + +argv <- commandArgs(trailingOnly = TRUE) + +nfrom = 128 +nto = 2048 +nstep = 128 +loops = 1 + +if ( length(argv) > 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + C <- A %*% B + l = l + 1 + } + + end <- proc.time()[3] + timeg = end - start + mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R new file mode 100755 index 000000000..6c6b77f70 --- /dev/null +++ b/benchmark/scripts/R/dsolve.R @@ -0,0 +1,63 @@ +#!/usr/bin/Rscript + +argv <- 
commandArgs(trailingOnly = TRUE) + +nfrom = 128 +nto = 2048 +nstep = 128 +loops = 1 + +if ( length(argv) > 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + solve(A,B) + l = l + 1 + } + + end <- proc.time()[3] + timeg = end - start + mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + From 29293160a407b36d226532d83288da06e769bd5e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 8 Jun 2015 10:53:50 -0500 Subject: [PATCH 200/257] Fix #593. Change MACOSX_DEPLOYMENT_TARGET to 10.6. --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index fcaa49a40..628e6fb55 100644 --- a/Makefile.system +++ b/Makefile.system @@ -213,7 +213,7 @@ endif # ifeq ($(OSNAME), Darwin) -export MACOSX_DEPLOYMENT_TARGET=10.2 +export MACOSX_DEPLOYMENT_TARGET=10.6 MD5SUM = md5 -r endif From 4f5691e5c00e906d66b0eeb3d90c87f618d41c6d Mon Sep 17 00:00:00 2001 From: Thomas Anderson Date: Fri, 12 Jun 2015 23:52:07 -0700 Subject: [PATCH 201/257] Fix test execution when USE_OPENMP=0 The standard way to disable OpenMP support is to set USE_OPENMP=0, as indicated by other checks to see if USE_OPENMP equals 1. 
The problem is obviously then that `ifdef USE_OPENMP` is very much not what we want to test for. This causes tests to fail when no OpenMP library is installed. --- Makefile.system | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile.system b/Makefile.system index 628e6fb55..5c3910989 100644 --- a/Makefile.system +++ b/Makefile.system @@ -591,7 +591,7 @@ else FCOMMON_OPT += -m32 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif endif @@ -603,14 +603,14 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifeq ($(F_COMPILER), FUJITSU) CCOMMON_OPT += -DF_INTERFACE_FUJITSU -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif @@ -628,7 +628,7 @@ endif else FCOMMON_OPT += -q32 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif @@ -646,7 +646,7 @@ FCOMMON_OPT += -tp p7-64 else FCOMMON_OPT += -tp p7 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif @@ -675,7 +675,7 @@ FCOMMON_OPT += -mabi=n32 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif @@ -712,7 +712,7 @@ FCOMMON_OPT += -m64 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FEXTRALIB += -lstdc++ FCOMMON_OPT += -mp endif @@ -760,14 +760,14 @@ FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -xopenmp=parallel endif endif ifeq ($(F_COMPILER), COMPAQ) CCOMMON_OPT += -DF_INTERFACE_COMPAQ -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif From 9bd962f6557a242925a9d1bad710c4c53c559615 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 13 Jun 2015 10:28:27 +0200 Subject: [PATCH 202/257] modified haswell parameter dgemm_unroll_n --- kernel/Makefile.L3 | 4 + kernel/generic/trmmkernel_4x8.c | 1402 +++++++ kernel/x86_64/KERNEL.HASWELL | 18 +- 
kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4753 ++++++++++++++++++++++ param.h | 2 +- 5 files changed, 6171 insertions(+), 8 deletions(-) create mode 100644 kernel/generic/trmmkernel_4x8.c create mode 100644 kernel/x86_64/dgemm_kernel_4x8_haswell.S diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index fdbae2daa..4ef351de3 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif +ifeq ($(CORE), HASWELL) +USE_TRMM = 1 +endif + SKERNELOBJS += \ diff --git a/kernel/generic/trmmkernel_4x8.c b/kernel/generic/trmmkernel_4x8.c new file mode 100644 index 000000000..09c47f147 --- /dev/null +++ b/kernel/generic/trmmkernel_4x8.c @@ -0,0 +1,1402 @@ +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 
4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + 
+.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + 
vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + 
+#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + 
vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, 
%xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + 
vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + 
+/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 
0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd 
(CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , 
%xmm6
+	vmovddup -7 * SIZE(BO), %xmm3
+	vfmadd231pd %xmm0 ,%xmm1 , %xmm7
+	vmovddup -6 * SIZE(BO), %xmm1
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm8
+	vmovddup -5 * SIZE(BO), %xmm2
+	vfmadd231pd %xmm0 ,%xmm3 , %xmm9
+	vfmadd231pd %xmm0 ,%xmm1 , %xmm10
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm11
+	addq $ 8*SIZE, BO
+	addq $ 2*SIZE, AO
+
+.endm
+
+/*
+ * SAVE2x8: scales the accumulators %xmm4-%xmm11 by ALPHA and writes eight
+ * 2-element vectors to CO1 + j*LDC for j = 0..7 (%xmm4+j goes to offset j).
+ * Unless TRMMKERNEL is defined, the current contents of C are added before
+ * the store. Clobbers %xmm0, %rax and %rbp; advances CO1 by 2*SIZE.
+ * ALPHA, CO1 and LDC are defined outside this macro.
+ *
+ * The second group is accumulated into and stored from %xmm8-%xmm11
+ * directly, so the stores for j = 4..7 are correct whether or not the
+ * TRMMKERNEL-guarded additions are assembled.
+ */
+.macro SAVE2x8
+
+	vmovddup ALPHA, %xmm0
+
+	vmulpd %xmm0 , %xmm4 , %xmm4
+	vmulpd %xmm0 , %xmm5 , %xmm5
+	vmulpd %xmm0 , %xmm6 , %xmm6
+	vmulpd %xmm0 , %xmm7 , %xmm7
+
+	vmulpd %xmm0 , %xmm8 , %xmm8
+	vmulpd %xmm0 , %xmm9 , %xmm9
+	vmulpd %xmm0 , %xmm10, %xmm10
+	vmulpd %xmm0 , %xmm11, %xmm11
+
+	leaq (CO1, LDC, 2), %rax	/* rax = CO1 + 2*LDC */
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1), %xmm4, %xmm4
+	vaddpd (CO1, LDC), %xmm5, %xmm5
+	vaddpd (%rax), %xmm6, %xmm6
+	vaddpd (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm5 , (CO1, LDC)
+	vmovups %xmm6 , (%rax)
+	vmovups %xmm7 , (%rax, LDC)
+
+
+	leaq (%rax, LDC, 2), %rax	/* rax = CO1 + 4*LDC */
+	leaq (%rax, LDC, 2), %rbp	/* rbp = CO1 + 6*LDC */
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (%rax), %xmm8 , %xmm8
+	vaddpd (%rax, LDC), %xmm9 , %xmm9
+	vaddpd (%rbp), %xmm10, %xmm10
+	vaddpd (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovups %xmm8 , (%rax)
+	vmovups %xmm9 , (%rax, LDC)
+	vmovups %xmm10, (%rbp)
+	vmovups %xmm11, (%rbp, LDC)
+
+	addq $ 2*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+
+/* INIT1x8: zeroes the eight accumulators %xmm4-%xmm11 used by the 1x8 tile. */
+.macro INIT1x8
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm5 , %xmm5 , %xmm5
+	vxorpd %xmm6 , %xmm6 , %xmm6
+	vxorpd %xmm7 , %xmm7 , %xmm7
+	vxorpd %xmm8 , %xmm8 , %xmm8
+	vxorpd %xmm9 , %xmm9 , %xmm9
+	vxorpd %xmm10, %xmm10, %xmm10
+	vxorpd %xmm11, %xmm11, %xmm11
+
+.endm
+
+/*
+ * KERNEL1x8_SUB: one k step of the 1x8 tile. A single A value in %xmm0 is
+ * multiplied by the eight B values at -12..-5 * SIZE(BO), loaded through
+ * the rotating temporaries %xmm1-%xmm3, and accumulated (scalar FMA) into
+ * %xmm4-%xmm11 in order.
+ */
+.macro KERNEL1x8_SUB
+	vmovsd -16 * SIZE(AO), %xmm0
+	vmovsd -12 * SIZE(BO), %xmm1
+	vmovsd -11 * SIZE(BO), %xmm2
+	vmovsd -10 * SIZE(BO), %xmm3
+	vfmadd231sd %xmm0 ,%xmm1 , %xmm4
+	vmovsd -9 * SIZE(BO), %xmm1
+	vfmadd231sd %xmm0 ,%xmm2 , %xmm5
+	vmovsd -8 * SIZE(BO), %xmm2
+	vfmadd231sd %xmm0 ,%xmm3 ,
%xmm6
+	vmovsd -7 * SIZE(BO), %xmm3
+	vfmadd231sd %xmm0 ,%xmm1 , %xmm7
+	vmovsd -6 * SIZE(BO), %xmm1
+	vfmadd231sd %xmm0 ,%xmm2 , %xmm8
+	vmovsd -5 * SIZE(BO), %xmm2
+	vfmadd231sd %xmm0 ,%xmm3 , %xmm9
+	vfmadd231sd %xmm0 ,%xmm1 , %xmm10
+	vfmadd231sd %xmm0 ,%xmm2 , %xmm11
+	addq $ 8*SIZE, BO
+	addq $ 1*SIZE, AO
+
+.endm
+
+/*
+ * SAVE1x8: scales the scalar accumulators %xmm4-%xmm11 by ALPHA and writes
+ * one value to each of CO1 + j*LDC for j = 0..7 (%xmm4+j goes to offset j).
+ * Unless TRMMKERNEL is defined, the current contents of C are added before
+ * the store. Clobbers %xmm0, %rax and %rbp; advances CO1 by 1*SIZE.
+ * ALPHA, CO1 and LDC are defined outside this macro.
+ *
+ * The second group is accumulated into and stored from %xmm8-%xmm11
+ * directly, so the stores for j = 4..7 are correct whether or not the
+ * TRMMKERNEL-guarded additions are assembled.
+ */
+.macro SAVE1x8
+
+	vmovsd ALPHA, %xmm0
+
+	vmulsd %xmm0 , %xmm4 , %xmm4
+	vmulsd %xmm0 , %xmm5 , %xmm5
+	vmulsd %xmm0 , %xmm6 , %xmm6
+	vmulsd %xmm0 , %xmm7 , %xmm7
+
+	vmulsd %xmm0 , %xmm8 , %xmm8
+	vmulsd %xmm0 , %xmm9 , %xmm9
+	vmulsd %xmm0 , %xmm10, %xmm10
+	vmulsd %xmm0 , %xmm11, %xmm11
+
+	leaq (CO1, LDC, 2), %rax	/* rax = CO1 + 2*LDC */
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd (CO1), %xmm4, %xmm4
+	vaddsd (CO1, LDC), %xmm5, %xmm5
+	vaddsd (%rax), %xmm6, %xmm6
+	vaddsd (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovsd %xmm4 , (CO1)
+	vmovsd %xmm5 , (CO1, LDC)
+	vmovsd %xmm6 , (%rax)
+	vmovsd %xmm7 , (%rax, LDC)
+
+
+	leaq (%rax, LDC, 2), %rax	/* rax = CO1 + 4*LDC */
+	leaq (%rax, LDC, 2), %rbp	/* rbp = CO1 + 6*LDC */
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd (%rax), %xmm8 , %xmm8
+	vaddsd (%rax, LDC), %xmm9 , %xmm9
+	vaddsd (%rbp), %xmm10, %xmm10
+	vaddsd (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovsd %xmm8 , (%rax)
+	vmovsd %xmm9 , (%rax, LDC)
+	vmovsd %xmm10, (%rbp)
+	vmovsd %xmm11, (%rbp, LDC)
+
+	addq $ 1*SIZE, CO1
+.endm
+
+
+
+
+
+/******************************************************************************************/
+
+/* INIT4x4: zeroes the four accumulators %ymm4-%ymm7 used by the 4x4 tile. */
+.macro INIT4x4
+
+	vxorpd %ymm4 , %ymm4 , %ymm4
+	vxorpd %ymm5 , %ymm5 , %ymm5
+	vxorpd %ymm6 , %ymm6 , %ymm6
+	vxorpd %ymm7 , %ymm7 , %ymm7
+
+.endm
+
+/*
+ * KERNEL4x4_I: first step of the unrolled 4x4 sequence. It uses vmulpd
+ * rather than an FMA, so %ymm4-%ymm7 are overwritten and need no prior
+ * INIT4x4. BO is advanced by 4*SIZE and the next B vector is preloaded
+ * into %ymm1; AO is left unchanged.
+ */
+.macro KERNEL4x4_I
+	prefetcht0 A_PR1(AO)
+	vmovups -12 * SIZE(BO), %ymm1
+	vmovups -16 * SIZE(AO), %ymm0
+	vmulpd %ymm0 ,%ymm1 , %ymm4
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vmulpd %ymm0 ,%ymm1 , %ymm5
+	vpermpd $ 0x1b, %ymm0 , %ymm0
+	vmulpd %ymm0 ,%ymm1 , %ymm6
+
+	addq $ 4*SIZE, BO
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vmulpd %ymm0 ,%ymm1 , %ymm7
+	vmovups -12 * SIZE(BO), %ymm1
+
+.endm
+
+.macro KERNEL4x4_M1
+	prefetcht0 A_PR1(AO)
+	vmovups -16
* SIZE(AO), %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm4
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+	vpermpd $ 0x1b, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm6
+
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm7
+	vmovups -12 * SIZE(BO), %ymm1
+
+.endm
+
+/*
+ * KERNEL4x4_M2: middle step of the unrolled 4x4 sequence. Consumes the B
+ * vector preloaded in %ymm1 with the A vector at -12 * SIZE(AO), advances
+ * AO by 8*SIZE, preloads the next B vector from -8 * SIZE(BO) and then
+ * advances BO by 8*SIZE.
+ */
+.macro KERNEL4x4_M2
+	vmovups -12 * SIZE(AO), %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm4
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+	vpermpd $ 0x1b, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm6
+
+	addq $ 8*SIZE, AO
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm7
+	vmovups -8 * SIZE(BO), %ymm1
+	addq $ 8*SIZE, BO
+.endm
+
+
+/*
+ * KERNEL4x4_E: closing step of the unrolled 4x4 sequence. Same arithmetic
+ * as KERNEL4x4_M2 but no B vector is preloaded; AO advances by 8*SIZE and
+ * BO by 4*SIZE.
+ */
+.macro KERNEL4x4_E
+	vmovups -12 * SIZE(AO), %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm4
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+	vpermpd $ 0x1b, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm6
+
+	addq $ 8*SIZE, AO
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm7
+	addq $ 4*SIZE, BO
+.endm
+
+/*
+ * KERNEL4x4_SUB: one self-contained k step of the 4x4 tile. Loads one B
+ * vector and one A vector, accumulates the four lane orders of A into
+ * %ymm4-%ymm7, and advances BO and AO by 4*SIZE each.
+ */
+.macro KERNEL4x4_SUB
+	vmovups -12 * SIZE(BO), %ymm1
+	vmovups -16 * SIZE(AO), %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm4
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+	addq $ 4*SIZE, BO
+	vpermpd $ 0x1b, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm6
+	addq $ 4*SIZE, AO
+	vpermpd $ 0xb1, %ymm0 , %ymm0
+	vfmadd231pd %ymm0 ,%ymm1 , %ymm7
+
+.endm
+
+/*
+ * SAVE4x4: scales the accumulators %ymm4-%ymm7 by ALPHA, undoes the lane
+ * permutation introduced by the 4x4 kernels with vpermpd/vblendpd, and
+ * writes four 4-element vectors to CO1 + j*LDC for j = 0..3. Unless
+ * TRMMKERNEL is defined, the current contents of C are added before the
+ * store. Clobbers %ymm0-%ymm3 and %rax; advances CO1 by 4*SIZE.
+ */
+.macro SAVE4x4
+
+	vbroadcastsd ALPHA, %ymm0
+
+	vmulpd %ymm0 , %ymm4 , %ymm4
+	vmulpd %ymm0 , %ymm7 , %ymm7
+	vmulpd %ymm0 , %ymm5 , %ymm5
+	vmulpd %ymm0 , %ymm6 , %ymm6
+
+	vpermpd $ 0xb1 , %ymm5, %ymm5
+	vpermpd $ 0xb1 , %ymm7, %ymm7
+
+	vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
+	vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
+	vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
+	vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+	leaq (CO1, LDC, 2), %rax	/* rax = CO1 + 2*LDC */
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1), %ymm4, %ymm4
+	vaddpd (CO1, LDC), %ymm5, %ymm5
+	vaddpd (%rax), %ymm6, %ymm6
+	vaddpd (%rax, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+	vmovups %ymm5 , (CO1, LDC)
+	vmovups %ymm6 , (%rax)
+	vmovups %ymm7 , (%rax, LDC)
+
+	addq $ 4*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT2x4: zeroes the four accumulators %xmm4-%xmm7 used by the 2x4 tile. */
+.macro INIT2x4
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm5 , %xmm5 , %xmm5
+	vxorpd %xmm6 , %xmm6 , %xmm6
+	vxorpd %xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+/*
+ * KERNEL2x4_SUB: one k step of the 2x4 tile. Two A values in %xmm0 are
+ * multiplied by four duplicated B values and accumulated into
+ * %xmm4-%xmm7. %xmm8 is used as a temporary for the fourth B value.
+ * Advances BO by 4*SIZE and AO by 2*SIZE.
+ */
+.macro KERNEL2x4_SUB
+	vmovddup -12 * SIZE(BO), %xmm1
+	vmovups -16 * SIZE(AO), %xmm0
+	vmovddup -11 * SIZE(BO), %xmm2
+	vfmadd231pd %xmm0 ,%xmm1 , %xmm4
+	vmovddup -10 * SIZE(BO), %xmm3
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm5
+	vmovddup -9 * SIZE(BO), %xmm8
+	vfmadd231pd %xmm0 ,%xmm3 , %xmm6
+	addq $ 4*SIZE, BO
+	vfmadd231pd %xmm0 ,%xmm8 , %xmm7
+	addq $ 2*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE2x4: scales %xmm4-%xmm7 by ALPHA and writes four 2-element vectors
+ * to CO1 + j*LDC for j = 0..3, adding the current contents of C first
+ * unless TRMMKERNEL is defined. Clobbers %xmm0 and %rax; advances CO1 by
+ * 2*SIZE.
+ */
+.macro SAVE2x4
+
+	vmovddup ALPHA, %xmm0
+
+	vmulpd %xmm0 , %xmm4 , %xmm4
+	vmulpd %xmm0 , %xmm5 , %xmm5
+	vmulpd %xmm0 , %xmm6 , %xmm6
+	vmulpd %xmm0 , %xmm7 , %xmm7
+
+	leaq (CO1, LDC, 2), %rax	/* rax = CO1 + 2*LDC */
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1), %xmm4, %xmm4
+	vaddpd (CO1, LDC), %xmm5, %xmm5
+	vaddpd (%rax), %xmm6, %xmm6
+	vaddpd (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm5 , (CO1, LDC)
+	vmovups %xmm6 , (%rax)
+	vmovups %xmm7 , (%rax, LDC)
+
+	addq $ 2*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT1x4: zeroes the four accumulators %xmm4-%xmm7 used by the 1x4 tile. */
+.macro INIT1x4
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm5 , %xmm5 , %xmm5
+	vxorpd %xmm6 , %xmm6 , %xmm6
+	vxorpd %xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+/*
+ * KERNEL1x4_SUB: one k step of the 1x4 tile. A single A value in %xmm0 is
+ * multiplied by four B values and accumulated (scalar FMA) into
+ * %xmm4-%xmm7. %xmm8 is used as a temporary for the fourth B value.
+ * Advances BO by 4*SIZE and AO by 1*SIZE.
+ */
+.macro KERNEL1x4_SUB
+	vmovsd -12 * SIZE(BO), %xmm1
+	vmovsd -16 * SIZE(AO), %xmm0
+	vmovsd -11 * SIZE(BO), %xmm2
+	vfmadd231sd %xmm0 ,%xmm1 , %xmm4
+	vmovsd -10 * SIZE(BO), %xmm3
+	vfmadd231sd %xmm0 ,%xmm2 , %xmm5
+	vmovsd -9 * SIZE(BO), %xmm8
+	vfmadd231sd %xmm0 ,%xmm3 , %xmm6
+	addq $ 4*SIZE, BO
+	vfmadd231sd %xmm0 ,%xmm8 , %xmm7
+	addq $ 1*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE1x4: scales %xmm4-%xmm7 by ALPHA and writes one value to each of
+ * CO1 + j*LDC for j = 0..3, adding the current contents of C first unless
+ * TRMMKERNEL is defined. Clobbers %xmm0 and %rax; advances CO1 by 1*SIZE.
+ */
+.macro SAVE1x4
+
+	vmovsd ALPHA, %xmm0
+
+	vmulsd %xmm0 , %xmm4 , %xmm4
+	vmulsd %xmm0 , %xmm5 , %xmm5
+	vmulsd %xmm0 , %xmm6 , %xmm6
+	vmulsd %xmm0 , %xmm7 , %xmm7
+
+	leaq (CO1, LDC, 2), %rax	/* rax = CO1 + 2*LDC */
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd (CO1), %xmm4, %xmm4
+	vaddsd (CO1, LDC), %xmm5, %xmm5
+	vaddsd (%rax), %xmm6, %xmm6
+	vaddsd (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovsd %xmm4 , (CO1)
+	vmovsd %xmm5 , (CO1, LDC)
+	vmovsd %xmm6 , (%rax)
+	vmovsd %xmm7 , (%rax, LDC)
+
+	addq $ 1*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT4x2: zeroes the four accumulators %xmm4-%xmm7 used by the 4x2 tile. */
+.macro INIT4x2
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm5 , %xmm5 , %xmm5
+	vxorpd %xmm6 , %xmm6 , %xmm6
+	vxorpd %xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+/*
+ * KERNEL4x2_SUB: one k step of the 4x2 tile. Four A values are held as two
+ * 2-element halves (%xmm0, %xmm1) and two B values are duplicated into
+ * %xmm2/%xmm3. %xmm4/%xmm5 accumulate the products with the first B value,
+ * %xmm6/%xmm7 those with the second. Advances BO by 2*SIZE, AO by 4*SIZE.
+ */
+.macro KERNEL4x2_SUB
+	vmovddup -12 * SIZE(BO), %xmm2
+	vmovups -16 * SIZE(AO), %xmm0
+	vmovups -14 * SIZE(AO), %xmm1
+	vmovddup -11 * SIZE(BO), %xmm3
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm4
+	vfmadd231pd %xmm1 ,%xmm2 , %xmm5
+	vfmadd231pd %xmm0 ,%xmm3 , %xmm6
+	vfmadd231pd %xmm1 ,%xmm3 , %xmm7
+	addq $ 2*SIZE, BO
+	addq $ 4*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE4x2: scales %xmm4-%xmm7 by ALPHA and writes them as two 4-element
+ * runs: %xmm4/%xmm5 at CO1 and CO1 + 2*SIZE, %xmm6/%xmm7 at the same
+ * offsets from CO1 + LDC. The current contents of C are added first unless
+ * TRMMKERNEL is defined. Clobbers %xmm0; advances CO1 by 4*SIZE.
+ */
+.macro SAVE4x2
+
+	vmovddup ALPHA, %xmm0
+
+	vmulpd %xmm0 , %xmm4 , %xmm4
+	vmulpd %xmm0 , %xmm5 , %xmm5
+	vmulpd %xmm0 , %xmm6 , %xmm6
+	vmulpd %xmm0 , %xmm7 , %xmm7
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1) , %xmm4, %xmm4
+	vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
+	vaddpd (CO1, LDC), %xmm6, %xmm6
+	vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm5 , 2 * SIZE(CO1)
+	vmovups %xmm6 , (CO1, LDC)
+	vmovups %xmm7 , 2 * SIZE(CO1, LDC)
+
+	addq $ 4*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT2x2: zeroes the two accumulators %xmm4 and %xmm6 used by the 2x2 tile. */
+.macro INIT2x2
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm6 , %xmm6 , %xmm6
+
+.endm
+
+
+/*
+ * KERNEL2x2_SUB: one k step of the 2x2 tile. Two A values in %xmm0 are
+ * multiplied by two duplicated B values; %xmm4 accumulates the products
+ * with the first, %xmm6 those with the second. Advances BO and AO by
+ * 2*SIZE each.
+ */
+.macro KERNEL2x2_SUB
+	vmovddup -12 * SIZE(BO), %xmm2
+	vmovups -16 * SIZE(AO), %xmm0
+	vmovddup -11 * SIZE(BO), %xmm3
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm4
+	vfmadd231pd %xmm0 ,%xmm3 , %xmm6
+	addq $ 2*SIZE, BO
+	addq $ 2*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE2x2: scales %xmm4 and %xmm6 by ALPHA and writes them to CO1 and
+ * CO1 + LDC, adding the current contents of C first unless TRMMKERNEL is
+ * defined. Clobbers %xmm0; advances CO1 by 2*SIZE.
+ */
+.macro SAVE2x2
+
+	vmovddup ALPHA, %xmm0
+
+	vmulpd %xmm0 , %xmm4 , %xmm4
+	vmulpd %xmm0 , %xmm6 , %xmm6
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1) , %xmm4, %xmm4
+	vaddpd (CO1, LDC), %xmm6, %xmm6
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm6 , (CO1, LDC)
+
+	addq $ 2*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT1x2: zeroes the two accumulators %xmm4 and %xmm5 used by the 1x2 tile. */
+.macro INIT1x2
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+	vxorpd %xmm5 , %xmm5 , %xmm5
+
+.endm
+
+
+/*
+ * KERNEL1x2_SUB: one k step of the 1x2 tile. One A value in %xmm0 is
+ * multiplied by two B values and accumulated (scalar FMA) into %xmm4 and
+ * %xmm5. Advances BO by 2*SIZE and AO by 1*SIZE.
+ */
+.macro KERNEL1x2_SUB
+	vmovsd -12 * SIZE(BO), %xmm1
+	vmovsd -16 * SIZE(AO), %xmm0
+	vmovsd -11 * SIZE(BO), %xmm2
+	vfmadd231sd %xmm0 ,%xmm1 , %xmm4
+	vfmadd231sd %xmm0 ,%xmm2 , %xmm5
+	addq $ 2*SIZE, BO
+	addq $ 1*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE1x2: scales %xmm4 and %xmm5 by ALPHA and writes one value each to
+ * CO1 and CO1 + LDC, adding the current contents of C first unless
+ * TRMMKERNEL is defined. Clobbers %xmm0; advances CO1 by 1*SIZE.
+ */
+.macro SAVE1x2
+
+	vmovsd ALPHA, %xmm0
+
+	vmulsd %xmm0 , %xmm4 , %xmm4
+	vmulsd %xmm0 , %xmm5 , %xmm5
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd (CO1), %xmm4, %xmm4
+	vaddsd (CO1, LDC), %xmm5, %xmm5
+
+#endif
+
+	vmovsd %xmm4 , (CO1)
+	vmovsd %xmm5 , (CO1, LDC)
+
+	addq $ 1*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/*
+ * INIT4x1: zeroes %ymm4-%ymm7. KERNEL4x1 spreads its partial sums over all
+ * four registers; KERNEL4x1_SUB only adds to %ymm4.
+ */
+.macro INIT4x1
+
+	vxorpd %ymm4 , %ymm4 , %ymm4
+	vxorpd %ymm5 , %ymm5 , %ymm5
+	vxorpd %ymm6 , %ymm6 , %ymm6
+	vxorpd %ymm7 , %ymm7 , %ymm7
+
+.endm
+
+
+/*
+ * KERNEL4x1: eight k steps of the 4x1 tile in one macro. Eight B values at
+ * -12..-5 * SIZE(BO) are broadcast and multiplied by the eight 4-element
+ * A vectors at -16..12 * SIZE(AO) (memory operands of the FMAs). The
+ * products are spread round-robin over %ymm4-%ymm7 and summed later by
+ * SAVE4x1. Advances BO by 8*SIZE and AO by 32*SIZE.
+ */
+.macro KERNEL4x1
+
+	vbroadcastsd -12 * SIZE(BO), %ymm0
+	vbroadcastsd -11 * SIZE(BO), %ymm1
+	vbroadcastsd -10 * SIZE(BO), %ymm2
+	vbroadcastsd -9 * SIZE(BO), %ymm3
+
+	vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
+	vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
+
+	vbroadcastsd -8 * SIZE(BO), %ymm0
+	vbroadcastsd -7 * SIZE(BO), %ymm1
+
+	vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
+	vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
+
+	vbroadcastsd -6 * SIZE(BO), %ymm2
+	vbroadcastsd -5 * SIZE(BO), %ymm3
+
+	vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
+	vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
+	vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
+	vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
+
+	addq $ 8 *SIZE, BO
+	addq $ 32*SIZE, AO
+
+.endm
+
+
+/*
+ * KERNEL4x1_SUB: a single k step of the 4x1 tile. One broadcast B value
+ * times one 4-element A vector, accumulated into %ymm4. Advances BO by
+ * 1*SIZE and AO by 4*SIZE.
+ */
+.macro KERNEL4x1_SUB
+	vbroadcastsd -12 * SIZE(BO), %ymm2
+	vmovups -16 * SIZE(AO), %ymm0
+	vfmadd231pd %ymm0 ,%ymm2 , %ymm4
+	addq $ 1*SIZE, BO
+	addq $ 4*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE4x1: sums the four partial accumulators %ymm4-%ymm7 into %ymm4,
+ * scales by ALPHA and writes one 4-element vector to CO1, adding the
+ * current contents of C first unless TRMMKERNEL is defined. Clobbers
+ * %ymm0 and %ymm6; advances CO1 by 4*SIZE.
+ */
+.macro SAVE4x1
+
+	vbroadcastsd ALPHA, %ymm0
+
+	vaddpd %ymm4,%ymm5, %ymm4
+	vaddpd %ymm6,%ymm7, %ymm6
+	vaddpd %ymm4,%ymm6, %ymm4
+
+	vmulpd %ymm0 , %ymm4 , %ymm4
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1) , %ymm4, %ymm4
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+
+	addq $ 4*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+/* INIT2x1: zeroes the single accumulator %xmm4 used by the 2x1 tile. */
+.macro INIT2x1
+
+	vxorpd %xmm4 , %xmm4 , %xmm4
+
+.endm
+
+
+/*
+ * KERNEL2x1_SUB: one k step of the 2x1 tile. One duplicated B value times
+ * two A values, accumulated into %xmm4. Advances BO by 1*SIZE and AO by
+ * 2*SIZE.
+ */
+.macro KERNEL2x1_SUB
+	vmovddup -12 * SIZE(BO), %xmm2
+	vmovups -16 * SIZE(AO), %xmm0
+	vfmadd231pd %xmm0 ,%xmm2 , %xmm4
+	addq $ 1*SIZE, BO
+	addq $ 2*SIZE, AO
+
+.endm
+
+
+/*
+ * SAVE2x1: scales %xmm4 by ALPHA and writes one 2-element vector to CO1,
+ * adding the current contents of C first unless TRMMKERNEL is defined.
+ * Clobbers %xmm0; advances CO1 by 2*SIZE.
+ */
+.macro SAVE2x1
+
+	vmovddup ALPHA, %xmm0
+
+	vmulpd %xmm0 , %xmm4 , %xmm4
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd (CO1) , %xmm4, %xmm4
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+
+	addq $ 2*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + 
movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of 
N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 
* SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + 
ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne 
.L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + 
movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + 
KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + 
dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax 
+ + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, 
OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add 
number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + 
addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO 
+ addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax 
// number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + 
movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if 
(k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + 
andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/param.h b/param.h index 245b678ef..6c9ca83da 100644 --- a/param.h +++ b/param.h @@ -1414,7 +1414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From e7c969e164900ed19461566af1a2201f98bc0a36 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 13 Jun 2015 16:16:29 +0200 Subject: [PATCH 203/257] added optimized dtrmm_kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 1546 ++++++++++++++++++++++ 2 files changed, 1547 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dtrmm_kernel_4x8_haswell.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index a01dc1ac8..a4686debb 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -40,7 +40,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRMMKERNEL = ../generic/trmmkernel_4x8.c +DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMMINCOPY = ../generic/gemm_ncopy_4.c 
DGEMMITCOPY = ../generic/gemm_tcopy_4.c diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c new file mode 100644 index 000000000..504c784ac --- /dev/null +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -0,0 +1,1546 @@ +#include "common.h" +#include + + +static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline)); + +static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) +{ + + BLASLONG I = 0; + BLASLONG temp1 = n * 8; + + __asm__ __volatile__ + ( + " vxorpd %%ymm4 , %%ymm4 , %%ymm4 \n\t" + " vxorpd %%ymm5 , %%ymm5 , %%ymm5 \n\t" + " vxorpd %%ymm6 , %%ymm6 , %%ymm6 \n\t" + " vxorpd %%ymm7 , %%ymm7 , %%ymm7 \n\t" + " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" + " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" + " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + + " cmp $0, %1 \n\t" + " jz 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + " vmovups (%2,%0,4) , %%ymm0 \n\t" + " vmovups (%3,%0,8) , %%ymm1 \n\t" + " vmovups 32(%3,%0,8) , %%ymm2 \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" + + " vpermpd $0x1b , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" + + " addq $8 , %0 \n\t" + " cmp %0 , %1 \n\t" + " jne 1b \n\t" + + "2: \n\t" + + " vbroadcastsd (%4), %%ymm0 \n\t" + + " vmulpd %%ymm0 , %%ymm4 , %%ymm4 \n\t" + " vmulpd %%ymm0 , %%ymm5 , %%ymm5 \n\t" + " vmulpd %%ymm0 , %%ymm6 
, %%ymm6 \n\t" + " vmulpd %%ymm0 , %%ymm7 , %%ymm7 \n\t" + " vmulpd %%ymm0 , %%ymm8 , %%ymm8 \n\t" + " vmulpd %%ymm0 , %%ymm9 , %%ymm9 \n\t" + " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" + " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" + + " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" + " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + + " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" + + " vmovups %%ymm4 , (%5) \n\t" + " vmovups %%ymm5 , (%6) \n\t" + " vmovups %%ymm6 , (%7) \n\t" + " vmovups %%ymm7 , (%8) \n\t" + + " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" + " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + + " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" + + " vmovups %%ymm4 , (%9) \n\t" + " vmovups %%ymm5 , (%10) \n\t" + " vmovups %%ymm6 , (%11) \n\t" + " vmovups %%ymm7 , (%12) \n\t" + + : + : + "a" (I), // 0 + "r" (temp1), // 1 + "S" (a), // 2 + "D" (b), // 3 + "r" (alpha), // 4 + "r" (C0), // 5 + "r" (C1), // 6 + "r" (C2), // 7 + "r" 
(C3), // 8 + "r" (C4), // 9 + "r" (C5), // 10 + "r" (C6), // 11 + "r" (C7) // 12 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + +} + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp ; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j Date: Fri, 26 Jun 2015 11:54:41 +0800 Subject: [PATCH 204/257] Fix blas lock bug on AArch64. --- common_arm64.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index ee13566f8..aa310c5f2 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void __inline blas_lock(volatile BLASULONG *address){ int register ret; + int register tmp; do { while (*address) {YIELDING;}; __asm__ __volatile__( "1: \n\t" - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "cmp r3, #0 \n\t" - "bne 1b \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) + "ldaxr %2, [%1] \n\t" + "mov %2, #0 \n\t" + "stlxr %w0, %2, [%1] \n\t" + "cbnz %w0, 1b \n\t" + "mov %0 , #0 \n\t" + : "=r"(ret), "=r"(address), "=r"(tmp) : "1"(address) - : "memory", "r2" , "r3" + : "memory", "%w0" + //, "%r2" , "%r3" ); From 7ba4fe5afbc6fffdbc6cd2cc6f7ff72050135996 Mon Sep 17 00:00:00 2001 From: Matthew Brandyberry Date: Tue, 21 Jul 2015 12:45:12 -0500 Subject: [PATCH 205/257] ppc64le platform support (ELF ABI v2) --- common_power.h | 31 ++++++++++++++++++ kernel/power/axpy.S | 2 +- kernel/power/axpy_ppc440.S | 2 +- kernel/power/gemm_beta.S | 18 +++++------ kernel/power/gemm_kernel.S | 38 +++++++++++----------- kernel/power/gemm_kernel_altivec.S | 2 +- kernel/power/gemm_kernel_altivec_cell.S | 2 +- kernel/power/gemm_kernel_altivec_g4.S | 2 +- kernel/power/gemm_kernel_cell.S | 38 +++++++++++----------- kernel/power/gemm_kernel_g4.S | 10 +++--- kernel/power/gemm_kernel_power3.S | 30 +++++++++--------- kernel/power/gemm_kernel_power6.S | 10 +++--- kernel/power/gemm_kernel_ppc440.S | 10 +++--- kernel/power/gemv_n.S | 30 +++++++++--------- kernel/power/gemv_n_ppc440.S | 30 +++++++++--------- kernel/power/gemv_t.S | 30 +++++++++--------- kernel/power/gemv_t_ppc440.S | 30 +++++++++--------- kernel/power/ger.S | 30 +++++++++--------- kernel/power/swap.S | 2 +- kernel/power/symv_L.S | 20 ++++++------ kernel/power/symv_U.S | 20 ++++++------ kernel/power/trsm_kernel_LN.S | 38 +++++++++++----------- kernel/power/trsm_kernel_LT.S | 38 +++++++++++----------- kernel/power/trsm_kernel_RT.S | 38 +++++++++++----------- kernel/power/trsm_kernel_cell_LN.S | 38 +++++++++++----------- kernel/power/trsm_kernel_cell_LT.S | 38 +++++++++++----------- 
kernel/power/trsm_kernel_cell_RT.S | 38 +++++++++++----------- kernel/power/trsm_kernel_power6_LN.S | 10 +++--- kernel/power/trsm_kernel_power6_LT.S | 10 +++--- kernel/power/trsm_kernel_power6_RT.S | 10 +++--- kernel/power/trsm_kernel_ppc440_LN.S | 10 +++--- kernel/power/trsm_kernel_ppc440_LT.S | 10 +++--- kernel/power/trsm_kernel_ppc440_RT.S | 10 +++--- kernel/power/zaxpy.S | 12 +++---- kernel/power/zaxpy_ppc440.S | 12 +++---- kernel/power/zgemm_beta.S | 18 +++++------ kernel/power/zgemm_kernel.S | 40 ++++++++++++------------ kernel/power/zgemm_kernel_altivec.S | 32 +++++++++---------- kernel/power/zgemm_kernel_altivec_cell.S | 32 +++++++++---------- kernel/power/zgemm_kernel_altivec_g4.S | 12 +++---- kernel/power/zgemm_kernel_cell.S | 40 ++++++++++++------------ kernel/power/zgemm_kernel_g4.S | 20 ++++++------ kernel/power/zgemm_kernel_power3.S | 32 +++++++++---------- kernel/power/zgemm_kernel_power6.S | 20 ++++++------ kernel/power/zgemm_kernel_ppc440.S | 20 ++++++------ kernel/power/zgemv_n.S | 30 +++++++++--------- kernel/power/zgemv_n_ppc440.S | 40 ++++++++++++------------ kernel/power/zgemv_t.S | 40 ++++++++++++------------ kernel/power/zgemv_t_ppc440.S | 40 ++++++++++++------------ kernel/power/zger.S | 40 ++++++++++++------------ kernel/power/zswap.S | 12 +++---- kernel/power/zsymv_L.S | 30 +++++++++--------- kernel/power/zsymv_U.S | 30 +++++++++--------- kernel/power/ztrsm_kernel_LN.S | 40 ++++++++++++------------ kernel/power/ztrsm_kernel_LT.S | 40 ++++++++++++------------ kernel/power/ztrsm_kernel_RT.S | 40 ++++++++++++------------ kernel/power/ztrsm_kernel_cell_LN.S | 20 ++++++------ kernel/power/ztrsm_kernel_cell_LT.S | 40 ++++++++++++------------ kernel/power/ztrsm_kernel_cell_RT.S | 20 ++++++------ kernel/power/ztrsm_kernel_power6_LN.S | 20 ++++++------ kernel/power/ztrsm_kernel_power6_LT.S | 20 ++++++------ kernel/power/ztrsm_kernel_power6_RT.S | 20 ++++++------ kernel/power/ztrsm_kernel_ppc440_LN.S | 20 ++++++------ 
kernel/power/ztrsm_kernel_ppc440_LT.S | 20 ++++++------ kernel/power/ztrsm_kernel_ppc440_RT.S | 20 ++++++------ 65 files changed, 789 insertions(+), 758 deletions(-) diff --git a/common_power.h b/common_power.h index f88f527bd..e9b5cb630 100644 --- a/common_power.h +++ b/common_power.h @@ -495,6 +495,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ REALNAME: #define EPILOGUE .size REALNAME, .-REALNAME #else +#if _CALL_ELF == 2 +#define PROLOGUE \ + .section .text;\ + .align 6;\ + .globl REALNAME;\ + .type REALNAME, @function;\ +REALNAME: +#define EPILOGUE .size REALNAME, .-REALNAME +#else #define PROLOGUE \ .section .text;\ .align 5;\ @@ -514,6 +523,7 @@ REALNAME:;\ .size .REALNAME, .-.REALNAME; \ .section .note.GNU-stack,"",@progbits #endif +#endif #ifdef PROFILE #ifndef __64BIT__ @@ -792,4 +802,25 @@ Lmcount$lazy_ptr: #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif + +#ifdef OS_LINUX +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 8) +#else +#if _CALL_ELF == 2 +#define FRAMESLOT(X) (((X) * 8) + 96) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif +#endif + +#if defined(OS_AIX) || defined(OS_DARWIN) +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 56) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif + #endif diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index 190f82d6b..fb9789da4 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -106,7 +106,7 @@ stfd f24, 80(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif fmr ALPHA, f1 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index df3f25e5f..81a660e4d 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -104,7 +104,7 @@ stfd f24, 80(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + 
STACKSIZE(SP) #endif fmr ALPHA, f1 diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 969f54c61..62d7761ec 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -64,24 +64,24 @@ #ifdef linux #ifndef __64BIT__ - lwz LDC, 8 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else - ld C, 112 + STACKSIZE(SP) - ld LDC, 120 + STACKSIZE(SP) + ld C, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld C, 112 + STACKSIZE(SP) - ld LDC, 120 + STACKSIZE(SP) + ld C, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz C, 56 + STACKSIZE(SP) - lwz LDC, 60 + STACKSIZE(SP) + lwz C, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index cae2fabca..e5e9ec346 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -179,7 +179,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -187,17 +187,17 @@ #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -231,29 +231,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + 
STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 8a525ef22..6c7e78319 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -241,7 +241,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index ac750c2e8..b7445a1f6 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -247,7 +247,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 26339afeb..548150143 100644 --- 
a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -241,7 +241,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index 1dbacc7f9..f3d3b8325 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -185,7 +185,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -193,17 +193,17 @@ #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -229,29 +229,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ -xc ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) +xc ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, 
FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index b6c849965..259f04c4e 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -177,7 +177,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -185,17 +185,17 @@ #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 6fe2def67..4a6b5da62 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -168,7 +168,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -190,29 +190,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld 
PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index 5f8fa76ce..1a412c4fb 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -176,7 +176,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -184,17 +184,17 @@ #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index 2e86d5130..b128beb38 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -176,7 +176,7 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + 
STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif @@ -184,17 +184,17 @@ #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 2b19f0a4e..77587ecb1 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -248,31 +248,31 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git 
a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index baedebc2b..beb21200a 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S @@ -201,31 +201,31 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 005e5d56c..817a60b86 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -257,31 +257,31 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if 
defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 62433af19..6e560db6c 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -192,31 +192,31 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) 
#endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/ger.S b/kernel/power/ger.S index bc10bf40d..fd397ce8c 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -226,31 +226,31 @@ #ifdef linux #ifndef __64BIT__ - lwz LDA, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld A, 112 + STACKSIZE(SP) - ld LDA, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld A, FRAMESLOT(0) + STACKSIZE(SP) + ld LDA, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCY, 56 + STACKSIZE(SP) - lwz A, 60 + STACKSIZE(SP) - lwz LDA, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz A, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDA, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz A, 56 + STACKSIZE(SP) - lwz LDA, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz A, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDA, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld A, 112 + STACKSIZE(SP) - ld LDA, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld A, FRAMESLOT(0) + STACKSIZE(SP) + ld LDA, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S index f8b56d472..e862b17bb 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -115,7 +115,7 @@ stfd f31, 136(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif slwi INCX, INCX, BASE_SHIFT diff --git 
a/kernel/power/symv_L.S b/kernel/power/symv_L.S index fbf735abd..f7d768c50 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -250,26 +250,26 @@ #ifdef linux #ifndef __64BIT__ - lwz BUFFER, 56 + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else - ld INCY, 112 + STACKSIZE(SP) - ld BUFFER, 120 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz INCY, 56 + STACKSIZE(SP) - lwz BUFFER, 60 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #else - ld INCY, 112 + STACKSIZE(SP) - ld BUFFER, 120 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index ec1aeea39..d8e082397 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -249,26 +249,26 @@ #ifdef linux #ifndef __64BIT__ - lwz BUFFER, 56 + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else - ld INCY, 112 + STACKSIZE(SP) - ld BUFFER, 120 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz INCY, 56 + STACKSIZE(SP) - lwz BUFFER, 60 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #else - ld INCY, 112 + STACKSIZE(SP) - ld BUFFER, 120 + 
STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 0c13a25a4..7983c573b 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -239,29 +239,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + 
STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index 06481e5e9..c561fd014 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -260,29 +260,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz 
PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 1777ba86d..07b88402c 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -257,29 +257,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) 
#else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S index b5ed925ed..803530cbb 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -234,29 +234,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, 
FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index cdc6f7514..105e7d43c 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -260,29 +260,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + 
STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index 731f52c19..a54a261cb 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -234,29 +234,29 @@ #ifdef linux #ifndef __64BIT__ mr PREA, r10 - lwz PREB, 8 + STACKSIZE(SP) - lwz PREC, 12 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 112 + STACKSIZE(SP) - ld PREB, 120 + STACKSIZE(SP) - ld PREC, 128 + STACKSIZE(SP) + ld PREA, FRAMESLOT(0) + STACKSIZE(SP) + ld PREB, FRAMESLOT(1) + STACKSIZE(SP) + ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 60 + 
STACKSIZE(SP) - lwz PREB, 64 + STACKSIZE(SP) - lwz PREC, 68 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz PREA, 56 + STACKSIZE(SP) - lwz PREB, 60 + STACKSIZE(SP) - lwz PREC, 64 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) + lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 2f85cd14b..937a6761a 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -173,24 +173,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 6b3d21b14..924f00ec0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -174,24 +174,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) 
#else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index f6b2e5cfb..40ee5e28d 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -173,24 +173,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 265e79e0f..6b7312101 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -185,24 +185,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git 
a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index de7ff7415..28b109b96 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -170,24 +170,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index e8d202d97..df80cd393 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -185,24 +185,24 @@ #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 112 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else - lwz OFFSET, 56 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index 1acd729ae..ac5b249bb 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -124,19 +124,19 @@ stfd f25, 88(SP) #if defined(linux) && defined(__64BIT__) - ld INCY, 112 + 
STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld INCY, 112 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 1ac232444..b5c604e91 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -113,19 +113,19 @@ stfd f25, 88(SP) #if defined(linux) && defined(__64BIT__) - ld INCY, 112 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld INCY, 112 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 4a9cbd8bb..43b72ca15 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -64,24 +64,24 @@ #ifdef linux #ifndef __64BIT__ - lwz LDC, 8 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else - ld C, 120 + STACKSIZE(SP) - ld LDC, 128 + STACKSIZE(SP) + ld C, FRAMESLOT(1) + STACKSIZE(SP) + ld LDC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld C, 120 + STACKSIZE(SP) - ld LDC, 128 + STACKSIZE(SP) + ld C, FRAMESLOT(1) + STACKSIZE(SP) + ld LDC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz C, 68 + STACKSIZE(SP) - lwz LDC, 72 + 
STACKSIZE(SP) + lwz C, FRAMESLOT(3) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(4) + STACKSIZE(SP) #else - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 3d6689531..8ec8b674a 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -171,37 +171,37 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -233,25 +233,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + 
ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2267e975a..2b650cd02 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -240,20 +240,20 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -266,25 +266,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREB, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREB, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREB, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREB, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREB, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREB, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREB, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(3) + STACKSIZE(SP) + lwz 
PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 9a1407d6e..642d1f2e7 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -246,20 +246,20 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -272,25 +272,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREB, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREB, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREB, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREB, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREB, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREB, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREB, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREB, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index 4c774a1e3..0f7a6f9aa 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -240,20 +240,20 @@ #ifdef 
linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 5667b130d..8fd6b0afb 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -177,37 +177,37 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -232,25 +232,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else 
- ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index af6f88e99..bf6bf77e8 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -187,37 +187,37 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_power3.S 
b/kernel/power/zgemm_kernel_power3.S index d7d6e2aea..471d3b9ae 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -163,20 +163,20 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif @@ -204,25 +204,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 3f79c0523..3c28649bc 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -201,37 +201,37 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || 
defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 075fa2b4f..748b69a0c 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -184,37 +184,37 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + 
STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index ba4685dec..23e0177c0 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -247,31 +247,31 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz LDA, 56 + STACKSIZE(SP) - lwz X, 60 + STACKSIZE(SP) - lwz INCX, 64 + STACKSIZE(SP) - lwz Y, 68 + STACKSIZE(SP) - lwz INCY, 72 + STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz X, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) + lwz Y, FRAMESLOT(3) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) #else - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 31e720261..55dd2d84f 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -225,36 +225,36 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + 
STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz LDA, 56 + STACKSIZE(SP) - lwz X, 60 + STACKSIZE(SP) - lwz INCX, 64 + STACKSIZE(SP) - lwz Y, 68 + STACKSIZE(SP) - lwz INCY, 72 + STACKSIZE(SP) - lwz BUFFER, 76 + STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz X, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) + lwz Y, FRAMESLOT(3) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index bd8ac4043..c0bad3152 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -222,36 +222,36 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || 
defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz LDA, 56 + STACKSIZE(SP) - lwz X, 60 + STACKSIZE(SP) - lwz INCX, 64 + STACKSIZE(SP) - lwz Y, 68 + STACKSIZE(SP) - lwz INCY, 72 + STACKSIZE(SP) - lwz BUFFER, 76 + STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz X, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) + lwz Y, FRAMESLOT(3) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index 043b9e37b..bfc039a0c 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -181,36 +181,36 @@ #ifdef linux #ifndef __64BIT__ - lwz INCY, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz LDA, 56 + STACKSIZE(SP) - lwz X, 60 + STACKSIZE(SP) - lwz INCX, 64 + STACKSIZE(SP) - lwz Y, 68 + STACKSIZE(SP) - lwz INCY, 72 + STACKSIZE(SP) - lwz BUFFER, 76 + 
STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz X, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) + lwz Y, FRAMESLOT(3) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else - ld INCX, 112 + STACKSIZE(SP) - ld Y, 120 + STACKSIZE(SP) - ld INCY, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCX, FRAMESLOT(0) + STACKSIZE(SP) + ld Y, FRAMESLOT(1) + STACKSIZE(SP) + ld INCY, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zger.S b/kernel/power/zger.S index 01cb90731..a9a607815 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -237,36 +237,36 @@ #ifdef linux #ifndef __64BIT__ - lwz LDA, 8 + STACKSIZE(SP) - lwz BUFFER, 12 + STACKSIZE(SP) + lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else - ld INCY, 112 + STACKSIZE(SP) - ld A, 120 + STACKSIZE(SP) - ld LDA, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld A, FRAMESLOT(1) + STACKSIZE(SP) + ld LDA, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) - lwz A, 68 + STACKSIZE(SP) - lwz LDA, 72 + STACKSIZE(SP) - lwz BUFFER, 76 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) + lwz A, FRAMESLOT(3) + STACKSIZE(SP) + lwz LDA, FRAMESLOT(4) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else - 
lwz INCY, 56 + STACKSIZE(SP) - lwz A, 60 + STACKSIZE(SP) - lwz LDA, 64 + STACKSIZE(SP) - lwz BUFFER, 68 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) + lwz A, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDA, FRAMESLOT(2) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else - ld INCY, 112 + STACKSIZE(SP) - ld A, 120 + STACKSIZE(SP) - ld LDA, 128 + STACKSIZE(SP) - ld BUFFER, 136 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) + ld A, FRAMESLOT(1) + STACKSIZE(SP) + ld LDA, FRAMESLOT(2) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 048e8ac5f..8befadca2 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -118,19 +118,19 @@ stfd f31, 136(SP) #if defined(linux) && defined(__64BIT__) - ld INCY, 112 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld INCY, 112 + STACKSIZE(SP) + ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz INCX, 56 + STACKSIZE(SP) - lwz Y, 60 + STACKSIZE(SP) - lwz INCY, 64 + STACKSIZE(SP) + lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) + lwz Y, FRAMESLOT(1) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz INCY, 56 + STACKSIZE(SP) + lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index ad4a8cd5c..b348e328f 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -261,31 +261,31 @@ #ifdef linux #ifndef __64BIT__ - lwz BUFFER, 56 + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz X, 56 + STACKSIZE(SP) - lwz INCX, 60 + STACKSIZE(SP) - lwz 
Y, 64 + STACKSIZE(SP) - lwz INCY, 68 + STACKSIZE(SP) - lwz BUFFER, 72 + STACKSIZE(SP) + lwz X, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(1) + STACKSIZE(SP) + lwz Y, FRAMESLOT(2) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(3) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index 4032b66bb..b631cbe35 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -258,31 +258,31 @@ #ifdef linux #ifndef __64BIT__ - lwz BUFFER, 56 + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE - lwz X, 56 + STACKSIZE(SP) - lwz INCX, 60 + STACKSIZE(SP) - lwz Y, 64 + STACKSIZE(SP) - lwz INCY, 68 + STACKSIZE(SP) - lwz BUFFER, 72 + STACKSIZE(SP) + lwz X, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCX, FRAMESLOT(1) + STACKSIZE(SP) + lwz Y, FRAMESLOT(2) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(3) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP) #else - lwz Y, 56 + STACKSIZE(SP) - lwz INCY, 60 + STACKSIZE(SP) - lwz BUFFER, 64 + STACKSIZE(SP) + lwz Y, FRAMESLOT(0) + STACKSIZE(SP) + lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) + lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else - ld Y, 112 + STACKSIZE(SP) - ld INCY, 120 + STACKSIZE(SP) - ld BUFFER, 128 + 
STACKSIZE(SP) + ld Y, FRAMESLOT(0) + STACKSIZE(SP) + ld INCY, FRAMESLOT(1) + STACKSIZE(SP) + ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 64fb96823..87473b45d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -168,36 +168,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -246,25 +246,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + 
STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index ae4615cf5..db0860124 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -168,36 +168,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -249,25 +249,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + 
STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index f756dda77..c50ab86df 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -168,36 +168,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -249,25 +249,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || 
defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 2427a4ddd..884a3e864 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -174,36 +174,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S index 0d88ded9a..388dfe3c2 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -174,36 +174,36 @@ #ifdef linux #ifdef 
__64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif @@ -248,25 +248,25 @@ #ifdef linux #ifndef __64BIT__ - lwz PREA, 16 + STACKSIZE(SP) - lwz PREC, 20 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld PREA, 136 + STACKSIZE(SP) - ld PREC, 144 + STACKSIZE(SP) + ld PREA, FRAMESLOT(3) + STACKSIZE(SP) + ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz PREA, 72 + STACKSIZE(SP) - lwz PREC, 76 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else - lwz PREA, 68 + STACKSIZE(SP) - lwz PREC, 72 + STACKSIZE(SP) + lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) + lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index 84f2089fa..00b50fe04 
100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -174,36 +174,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 42239bb55..65b8077db 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -186,36 +186,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld 
OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index dfae4d60b..c27170604 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -186,36 +186,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index 79f8b70b8..ff0338cdc 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -186,36 +186,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || 
defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index 51db71903..d33522456 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -179,36 +179,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, 
FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index b5e23b3c6..a9e7b891f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -179,36 +179,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index 2bb374d22..43f4b07cb 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -179,36 +179,36 @@ #ifdef linux #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, 112 + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz B, 56 + STACKSIZE(SP) - lwz C, 60 + STACKSIZE(SP) - lwz LDC, 64 + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + 
lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else - lwz LDC, 56 + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, 120 + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE - lwz OFFSET, 68 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else - lwz OFFSET, 60 + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif From 7ac7e147d4d9961d9082338efe7d6c6e6aea9a29 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 4 Aug 2015 04:37:05 +0800 Subject: [PATCH 206/257] Fixed cmake building bugs on Linux. Disable LAPACK by default. --- CMakeLists.txt | 42 ++++++++++++++++++++++++++++++++++++++---- cmake/prebuild.cmake | 10 +++++----- kernel/CMakeLists.txt | 4 ++-- kernel/x86_64/KERNEL | 4 ++-- 4 files changed, 47 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c2681141..1d2e5d3c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,10 +6,29 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 13) +set(OpenBLAS_PATCH_VERSION 14) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) +enable_language(C) + +set(OpenBLAS_LIBNAME openblas) + +####### +option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS and CBLAS)" ON) +option(BUILD_DEBUG "Build Debug Version" OFF) +####### +if(BUILD_WITHOUT_LAPACK) +set(NO_LAPACK 1) +endif() + +if(BUILD_DEBUG) +set(CMAKE_BUILD_TYPE Debug) +else() +set(CMAKE_BUILD_TYPE Release) +endif() +####### + message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. 
Only x86 support is currently available.") @@ -32,6 +51,7 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) + message ("error 1") list(APPEND SUBDIRS lapack) endif () @@ -90,15 +110,29 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. -if (NOT NOFORTRAN) +if (NOT NOFORTRAN AND NOT NO_LAPACK) + message ("error 2") include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") -endif () if (NOT NO_LAPACKE) include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") endif () +endif () # add objects to the openblas lib -add_library(openblas ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) +add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) + +#only build shared library for MSVC +if(NOT MSVC) +add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) +set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) +set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +endif() + +set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES + VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} + SOVERSION ${OpenBLAS_MAJOR_VERSION} +) + # TODO: Why is the config saved here? Is this necessary with CMake? 
#Save the config files for installation diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 3e2574f77..901c237c4 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -79,10 +79,10 @@ try_compile(GETARCH_RESULT ${GETARCH_DIR} message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way -execute_process(COMMAND ${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) -execute_process(COMMAND ${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) -#message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") +message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") # append config data from getarch to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) @@ -99,8 +99,8 @@ try_compile(GETARCH2_RESULT ${GETARCH2_DIR} ) # use the cmake binary w/ the -E param to run a shell command in a cross-platform way -execute_process(COMMAND ${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) -execute_process(COMMAND ${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) # append config data from getarch_2nd to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 4fe27a7d0..cd71101a5 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -16,8 +16,9 @@ else () endif () SetDefaultL1() -ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") +SetDefaultL2() ParseMakefileVars("${KERNELDIR}/KERNEL") +ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") if (${ARCH} STREQUAL "x86") 
GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) @@ -67,7 +68,6 @@ foreach (float_type ${FLOAT_TYPES}) endforeach () # Makefile.L2 -SetDefaultL2() GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 3508753ee..02e5a6047 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -119,11 +119,11 @@ XCOPYKERNEL = zcopy.S endif ifndef SDOTKERNEL -SDOTKERNEL = ../generic/dot.c +SDOTKERNEL = ../generic/dot.c endif ifndef DSDOTKERNEL -DSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c endif ifndef DDOTKERNEL From 1cf2b10224d4bd5ea9b4e1fc0654d9734fbcc777 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 3 Aug 2015 23:55:56 -0500 Subject: [PATCH 207/257] Use pure C generic target on x86 and x86_64. make TARGET=GENERIC ?gemm3m is unimplemented on generic target. --- Makefile.system | 10 ++ kernel/arm/scal.c | 2 +- kernel/generic/zgemm3mkernel_dump.c | 34 ++++++ kernel/x86/KERNEL.generic | 159 ++++++++++++++++++++++++++++ kernel/x86_64/KERNEL | 8 ++ kernel/x86_64/KERNEL.generic | 107 +++++++++++++++++++ 6 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 kernel/generic/zgemm3mkernel_dump.c create mode 100644 kernel/x86/KERNEL.generic diff --git a/Makefile.system b/Makefile.system index 5c3910989..325ee6af9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -336,6 +336,11 @@ ifeq ($(ARCH), x86) ifndef BINARY NO_BINARY_MODE = 1 endif + +ifeq ($(CORE), generic) +NO_EXPRECISION = 1 +endif + ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. GCC or LSB @@ -354,6 +359,11 @@ endif endif ifeq ($(ARCH), x86_64) + +ifeq ($(CORE), generic) +NO_EXPRECISION = 1 +endif + ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. 
GCC or LSB diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c index 4593e2279..91ca76569 100644 --- a/kernel/arm/scal.c +++ b/kernel/arm/scal.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS j++; } - return; + return 0; } diff --git a/kernel/generic/zgemm3mkernel_dump.c b/kernel/generic/zgemm3mkernel_dump.c new file mode 100644 index 000000000..a59bb08ce --- /dev/null +++ b/kernel/generic/zgemm3mkernel_dump.c @@ -0,0 +1,34 @@ +/*************************************************************************** +Copyright (c) 2011-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) +{ + return 0; +} diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic new file mode 100644 index 000000000..672edb069 --- /dev/null +++ b/kernel/x86/KERNEL.generic @@ -0,0 +1,159 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c 
+ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = 
../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 3508753ee..9db66818f 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -440,10 +440,18 @@ XGEMMITCOPYOBJ = XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SGEMM_BETA SGEMM_BETA = gemm_beta.S +endif +ifndef DGEMM_BETA DGEMM_BETA = gemm_beta.S +endif +ifndef CGEMM_BETA CGEMM_BETA = zgemm_beta.S 
+endif +ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta.S +endif QGEMM_BETA = ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 2bcd83636..672edb069 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -1,3 +1,8 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -50,3 +55,105 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c 
+ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From b8d64a856a6d0e345ae60e0ee66f47c6900c4ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Perez?= Date: Wed, 5 Aug 2015 11:02:39 -0300 Subject: [PATCH 208/257] Add POWER7/POWER8 as targets --- TargetList.txt | 2 ++ cpuid_power.c | 1 + getarch.c | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index 0a9d8b40c..b2878ba32 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -44,6 +44,8 @@ NANO POWER4 POWER5 POWER6 +POWER7 +POWER8 PPCG4 PPC970 PPC970MP diff --git a/cpuid_power.c b/cpuid_power.c index 2fc333dd2..366c6ed08 100644 
--- a/cpuid_power.c +++ b/cpuid_power.c @@ -115,6 +115,7 @@ int detect(void){ if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; diff --git a/getarch.c b/getarch.c index d6ecaeb62..d56a37a7a 100644 --- a/getarch.c +++ b/getarch.c @@ -116,6 +116,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_POWER4 */ /* #define FORCE_POWER5 */ /* #define FORCE_POWER6 */ +/* #define FORCE_POWER7 */ +/* #define FORCE_POWER8 */ /* #define FORCE_PPCG4 */ /* #define FORCE_PPC970 */ /* #define FORCE_PPC970MP */ @@ -546,7 +548,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER5" #endif -#ifdef FORCE_POWER6 +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" From 19664f3ef414e2ba23e55db9f6740c4fba0a35d7 Mon Sep 17 00:00:00 2001 From: Hank Anderson Date: Thu, 6 Aug 2015 07:40:06 -0500 Subject: [PATCH 209/257] Added missing lapacke.cmake file. 
--- cmake/lapacke.cmake | 2067 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2067 insertions(+) create mode 100644 cmake/lapacke.cmake diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake new file mode 100644 index 000000000..ce7f781dd --- /dev/null +++ b/cmake/lapacke.cmake @@ -0,0 +1,2067 @@ + +set(C_SRC + lapacke_cbbcsd.c + lapacke_cbbcsd_work.c + lapacke_cbdsqr.c + lapacke_cbdsqr_work.c + lapacke_cgbbrd.c + lapacke_cgbbrd_work.c + lapacke_cgbcon.c + lapacke_cgbcon_work.c + lapacke_cgbequ.c + lapacke_cgbequ_work.c + lapacke_cgbequb.c + lapacke_cgbequb_work.c + lapacke_cgbrfs.c + lapacke_cgbrfs_work.c + lapacke_cgbsv.c + lapacke_cgbsv_work.c + lapacke_cgbsvx.c + lapacke_cgbsvx_work.c + lapacke_cgbtrf.c + lapacke_cgbtrf_work.c + lapacke_cgbtrs.c + lapacke_cgbtrs_work.c + lapacke_cgebak.c + lapacke_cgebak_work.c + lapacke_cgebal.c + lapacke_cgebal_work.c + lapacke_cgebrd.c + lapacke_cgebrd_work.c + lapacke_cgecon.c + lapacke_cgecon_work.c + lapacke_cgeequ.c + lapacke_cgeequ_work.c + lapacke_cgeequb.c + lapacke_cgeequb_work.c + lapacke_cgees.c + lapacke_cgees_work.c + lapacke_cgeesx.c + lapacke_cgeesx_work.c + lapacke_cgeev.c + lapacke_cgeev_work.c + lapacke_cgeevx.c + lapacke_cgeevx_work.c + lapacke_cgehrd.c + lapacke_cgehrd_work.c + lapacke_cgelq2.c + lapacke_cgelq2_work.c + lapacke_cgelqf.c + lapacke_cgelqf_work.c + lapacke_cgels.c + lapacke_cgels_work.c + lapacke_cgelsd.c + lapacke_cgelsd_work.c + lapacke_cgelss.c + lapacke_cgelss_work.c + lapacke_cgelsy.c + lapacke_cgelsy_work.c + lapacke_cgemqrt.c + lapacke_cgemqrt_work.c + lapacke_cgeqlf.c + lapacke_cgeqlf_work.c + lapacke_cgeqp3.c + lapacke_cgeqp3_work.c + lapacke_cgeqpf.c + lapacke_cgeqpf_work.c + lapacke_cgeqr2.c + lapacke_cgeqr2_work.c + lapacke_cgeqrf.c + lapacke_cgeqrf_work.c + lapacke_cgeqrfp.c + lapacke_cgeqrfp_work.c + lapacke_cgeqrt.c + lapacke_cgeqrt2.c + lapacke_cgeqrt2_work.c + lapacke_cgeqrt3.c + lapacke_cgeqrt3_work.c + lapacke_cgeqrt_work.c + lapacke_cgerfs.c + 
lapacke_cgerfs_work.c + lapacke_cgerqf.c + lapacke_cgerqf_work.c + lapacke_cgesdd.c + lapacke_cgesdd_work.c + lapacke_cgesv.c + lapacke_cgesv_work.c + lapacke_cgesvd.c + lapacke_cgesvd_work.c + lapacke_cgesvx.c + lapacke_cgesvx_work.c + lapacke_cgetf2.c + lapacke_cgetf2_work.c + lapacke_cgetrf.c + lapacke_cgetrf_work.c + lapacke_cgetri.c + lapacke_cgetri_work.c + lapacke_cgetrs.c + lapacke_cgetrs_work.c + lapacke_cggbak.c + lapacke_cggbak_work.c + lapacke_cggbal.c + lapacke_cggbal_work.c + lapacke_cgges.c + lapacke_cgges_work.c + lapacke_cggesx.c + lapacke_cggesx_work.c + lapacke_cggev.c + lapacke_cggev_work.c + lapacke_cggevx.c + lapacke_cggevx_work.c + lapacke_cggglm.c + lapacke_cggglm_work.c + lapacke_cgghrd.c + lapacke_cgghrd_work.c + lapacke_cgglse.c + lapacke_cgglse_work.c + lapacke_cggqrf.c + lapacke_cggqrf_work.c + lapacke_cggrqf.c + lapacke_cggrqf_work.c + lapacke_cggsvd.c + lapacke_cggsvd_work.c + lapacke_cggsvp.c + lapacke_cggsvp_work.c + lapacke_cgtcon.c + lapacke_cgtcon_work.c + lapacke_cgtrfs.c + lapacke_cgtrfs_work.c + lapacke_cgtsv.c + lapacke_cgtsv_work.c + lapacke_cgtsvx.c + lapacke_cgtsvx_work.c + lapacke_cgttrf.c + lapacke_cgttrf_work.c + lapacke_cgttrs.c + lapacke_cgttrs_work.c + lapacke_chbev.c + lapacke_chbev_work.c + lapacke_chbevd.c + lapacke_chbevd_work.c + lapacke_chbevx.c + lapacke_chbevx_work.c + lapacke_chbgst.c + lapacke_chbgst_work.c + lapacke_chbgv.c + lapacke_chbgv_work.c + lapacke_chbgvd.c + lapacke_chbgvd_work.c + lapacke_chbgvx.c + lapacke_chbgvx_work.c + lapacke_chbtrd.c + lapacke_chbtrd_work.c + lapacke_checon.c + lapacke_checon_work.c + lapacke_cheequb.c + lapacke_cheequb_work.c + lapacke_cheev.c + lapacke_cheev_work.c + lapacke_cheevd.c + lapacke_cheevd_work.c + lapacke_cheevr.c + lapacke_cheevr_work.c + lapacke_cheevx.c + lapacke_cheevx_work.c + lapacke_chegst.c + lapacke_chegst_work.c + lapacke_chegv.c + lapacke_chegv_work.c + lapacke_chegvd.c + lapacke_chegvd_work.c + lapacke_chegvx.c + lapacke_chegvx_work.c + 
lapacke_cherfs.c + lapacke_cherfs_work.c + lapacke_chesv.c + lapacke_chesv_work.c + lapacke_chesvx.c + lapacke_chesvx_work.c + lapacke_cheswapr.c + lapacke_cheswapr_work.c + lapacke_chetrd.c + lapacke_chetrd_work.c + lapacke_chetrf.c + lapacke_chetrf_work.c + lapacke_chetri.c + lapacke_chetri2.c + lapacke_chetri2_work.c + lapacke_chetri2x.c + lapacke_chetri2x_work.c + lapacke_chetri_work.c + lapacke_chetrs.c + lapacke_chetrs2.c + lapacke_chetrs2_work.c + lapacke_chetrs_work.c + lapacke_chfrk.c + lapacke_chfrk_work.c + lapacke_chgeqz.c + lapacke_chgeqz_work.c + lapacke_chpcon.c + lapacke_chpcon_work.c + lapacke_chpev.c + lapacke_chpev_work.c + lapacke_chpevd.c + lapacke_chpevd_work.c + lapacke_chpevx.c + lapacke_chpevx_work.c + lapacke_chpgst.c + lapacke_chpgst_work.c + lapacke_chpgv.c + lapacke_chpgv_work.c + lapacke_chpgvd.c + lapacke_chpgvd_work.c + lapacke_chpgvx.c + lapacke_chpgvx_work.c + lapacke_chprfs.c + lapacke_chprfs_work.c + lapacke_chpsv.c + lapacke_chpsv_work.c + lapacke_chpsvx.c + lapacke_chpsvx_work.c + lapacke_chptrd.c + lapacke_chptrd_work.c + lapacke_chptrf.c + lapacke_chptrf_work.c + lapacke_chptri.c + lapacke_chptri_work.c + lapacke_chptrs.c + lapacke_chptrs_work.c + lapacke_chsein.c + lapacke_chsein_work.c + lapacke_chseqr.c + lapacke_chseqr_work.c + lapacke_clacgv.c + lapacke_clacgv_work.c + lapacke_clacn2.c + lapacke_clacn2_work.c + lapacke_clacp2.c + lapacke_clacp2_work.c + lapacke_clacpy.c + lapacke_clacpy_work.c + lapacke_clag2z.c + lapacke_clag2z_work.c + lapacke_clange.c + lapacke_clange_work.c + lapacke_clanhe.c + lapacke_clanhe_work.c + lapacke_clansy.c + lapacke_clansy_work.c + lapacke_clantr.c + lapacke_clantr_work.c + lapacke_clapmr.c + lapacke_clapmr_work.c + lapacke_clarfb.c + lapacke_clarfb_work.c + lapacke_clarfg.c + lapacke_clarfg_work.c + lapacke_clarft.c + lapacke_clarft_work.c + lapacke_clarfx.c + lapacke_clarfx_work.c + lapacke_clarnv.c + lapacke_clarnv_work.c + lapacke_claset.c + lapacke_claset_work.c + lapacke_claswp.c + 
lapacke_claswp_work.c + lapacke_clauum.c + lapacke_clauum_work.c + lapacke_cpbcon.c + lapacke_cpbcon_work.c + lapacke_cpbequ.c + lapacke_cpbequ_work.c + lapacke_cpbrfs.c + lapacke_cpbrfs_work.c + lapacke_cpbstf.c + lapacke_cpbstf_work.c + lapacke_cpbsv.c + lapacke_cpbsv_work.c + lapacke_cpbsvx.c + lapacke_cpbsvx_work.c + lapacke_cpbtrf.c + lapacke_cpbtrf_work.c + lapacke_cpbtrs.c + lapacke_cpbtrs_work.c + lapacke_cpftrf.c + lapacke_cpftrf_work.c + lapacke_cpftri.c + lapacke_cpftri_work.c + lapacke_cpftrs.c + lapacke_cpftrs_work.c + lapacke_cpocon.c + lapacke_cpocon_work.c + lapacke_cpoequ.c + lapacke_cpoequ_work.c + lapacke_cpoequb.c + lapacke_cpoequb_work.c + lapacke_cporfs.c + lapacke_cporfs_work.c + lapacke_cposv.c + lapacke_cposv_work.c + lapacke_cposvx.c + lapacke_cposvx_work.c + lapacke_cpotrf.c + lapacke_cpotrf_work.c + lapacke_cpotri.c + lapacke_cpotri_work.c + lapacke_cpotrs.c + lapacke_cpotrs_work.c + lapacke_cppcon.c + lapacke_cppcon_work.c + lapacke_cppequ.c + lapacke_cppequ_work.c + lapacke_cpprfs.c + lapacke_cpprfs_work.c + lapacke_cppsv.c + lapacke_cppsv_work.c + lapacke_cppsvx.c + lapacke_cppsvx_work.c + lapacke_cpptrf.c + lapacke_cpptrf_work.c + lapacke_cpptri.c + lapacke_cpptri_work.c + lapacke_cpptrs.c + lapacke_cpptrs_work.c + lapacke_cpstrf.c + lapacke_cpstrf_work.c + lapacke_cptcon.c + lapacke_cptcon_work.c + lapacke_cpteqr.c + lapacke_cpteqr_work.c + lapacke_cptrfs.c + lapacke_cptrfs_work.c + lapacke_cptsv.c + lapacke_cptsv_work.c + lapacke_cptsvx.c + lapacke_cptsvx_work.c + lapacke_cpttrf.c + lapacke_cpttrf_work.c + lapacke_cpttrs.c + lapacke_cpttrs_work.c + lapacke_cspcon.c + lapacke_cspcon_work.c + lapacke_csprfs.c + lapacke_csprfs_work.c + lapacke_cspsv.c + lapacke_cspsv_work.c + lapacke_cspsvx.c + lapacke_cspsvx_work.c + lapacke_csptrf.c + lapacke_csptrf_work.c + lapacke_csptri.c + lapacke_csptri_work.c + lapacke_csptrs.c + lapacke_csptrs_work.c + lapacke_cstedc.c + lapacke_cstedc_work.c + lapacke_cstegr.c + lapacke_cstegr_work.c + 
lapacke_cstein.c + lapacke_cstein_work.c + lapacke_cstemr.c + lapacke_cstemr_work.c + lapacke_csteqr.c + lapacke_csteqr_work.c + lapacke_csycon.c + lapacke_csycon_work.c + lapacke_csyconv.c + lapacke_csyconv_work.c + lapacke_csyequb.c + lapacke_csyequb_work.c + lapacke_csyrfs.c + lapacke_csyrfs_work.c + lapacke_csysv.c + lapacke_csysv_rook.c + lapacke_csysv_rook_work.c + lapacke_csysv_work.c + lapacke_csysvx.c + lapacke_csysvx_work.c + lapacke_csyswapr.c + lapacke_csyswapr_work.c + lapacke_csytrf.c + lapacke_csytrf_work.c + lapacke_csytri.c + lapacke_csytri2.c + lapacke_csytri2_work.c + lapacke_csytri2x.c + lapacke_csytri2x_work.c + lapacke_csytri_work.c + lapacke_csytrs.c + lapacke_csytrs2.c + lapacke_csytrs2_work.c + lapacke_csytrs_work.c + lapacke_ctbcon.c + lapacke_ctbcon_work.c + lapacke_ctbrfs.c + lapacke_ctbrfs_work.c + lapacke_ctbtrs.c + lapacke_ctbtrs_work.c + lapacke_ctfsm.c + lapacke_ctfsm_work.c + lapacke_ctftri.c + lapacke_ctftri_work.c + lapacke_ctfttp.c + lapacke_ctfttp_work.c + lapacke_ctfttr.c + lapacke_ctfttr_work.c + lapacke_ctgevc.c + lapacke_ctgevc_work.c + lapacke_ctgexc.c + lapacke_ctgexc_work.c + lapacke_ctgsen.c + lapacke_ctgsen_work.c + lapacke_ctgsja.c + lapacke_ctgsja_work.c + lapacke_ctgsna.c + lapacke_ctgsna_work.c + lapacke_ctgsyl.c + lapacke_ctgsyl_work.c + lapacke_ctpcon.c + lapacke_ctpcon_work.c + lapacke_ctpmqrt.c + lapacke_ctpmqrt_work.c + lapacke_ctpqrt.c + lapacke_ctpqrt2.c + lapacke_ctpqrt2_work.c + lapacke_ctpqrt_work.c + lapacke_ctprfb.c + lapacke_ctprfb_work.c + lapacke_ctprfs.c + lapacke_ctprfs_work.c + lapacke_ctptri.c + lapacke_ctptri_work.c + lapacke_ctptrs.c + lapacke_ctptrs_work.c + lapacke_ctpttf.c + lapacke_ctpttf_work.c + lapacke_ctpttr.c + lapacke_ctpttr_work.c + lapacke_ctrcon.c + lapacke_ctrcon_work.c + lapacke_ctrevc.c + lapacke_ctrevc_work.c + lapacke_ctrexc.c + lapacke_ctrexc_work.c + lapacke_ctrrfs.c + lapacke_ctrrfs_work.c + lapacke_ctrsen.c + lapacke_ctrsen_work.c + lapacke_ctrsna.c + lapacke_ctrsna_work.c 
+ lapacke_ctrsyl.c + lapacke_ctrsyl_work.c + lapacke_ctrtri.c + lapacke_ctrtri_work.c + lapacke_ctrtrs.c + lapacke_ctrtrs_work.c + lapacke_ctrttf.c + lapacke_ctrttf_work.c + lapacke_ctrttp.c + lapacke_ctrttp_work.c + lapacke_ctzrzf.c + lapacke_ctzrzf_work.c + lapacke_cunbdb.c + lapacke_cunbdb_work.c + lapacke_cuncsd.c + lapacke_cuncsd_work.c + lapacke_cungbr.c + lapacke_cungbr_work.c + lapacke_cunghr.c + lapacke_cunghr_work.c + lapacke_cunglq.c + lapacke_cunglq_work.c + lapacke_cungql.c + lapacke_cungql_work.c + lapacke_cungqr.c + lapacke_cungqr_work.c + lapacke_cungrq.c + lapacke_cungrq_work.c + lapacke_cungtr.c + lapacke_cungtr_work.c + lapacke_cunmbr.c + lapacke_cunmbr_work.c + lapacke_cunmhr.c + lapacke_cunmhr_work.c + lapacke_cunmlq.c + lapacke_cunmlq_work.c + lapacke_cunmql.c + lapacke_cunmql_work.c + lapacke_cunmqr.c + lapacke_cunmqr_work.c + lapacke_cunmrq.c + lapacke_cunmrq_work.c + lapacke_cunmrz.c + lapacke_cunmrz_work.c + lapacke_cunmtr.c + lapacke_cunmtr_work.c + lapacke_cupgtr.c + lapacke_cupgtr_work.c + lapacke_cupmtr.c + lapacke_cupmtr_work.c +) + +set(DSRC + lapacke_dbbcsd.c + lapacke_dbbcsd_work.c + lapacke_dbdsdc.c + lapacke_dbdsdc_work.c + lapacke_dbdsqr.c + lapacke_dbdsqr_work.c + lapacke_ddisna.c + lapacke_ddisna_work.c + lapacke_dgbbrd.c + lapacke_dgbbrd_work.c + lapacke_dgbcon.c + lapacke_dgbcon_work.c + lapacke_dgbequ.c + lapacke_dgbequ_work.c + lapacke_dgbequb.c + lapacke_dgbequb_work.c + lapacke_dgbrfs.c + lapacke_dgbrfs_work.c + lapacke_dgbsv.c + lapacke_dgbsv_work.c + lapacke_dgbsvx.c + lapacke_dgbsvx_work.c + lapacke_dgbtrf.c + lapacke_dgbtrf_work.c + lapacke_dgbtrs.c + lapacke_dgbtrs_work.c + lapacke_dgebak.c + lapacke_dgebak_work.c + lapacke_dgebal.c + lapacke_dgebal_work.c + lapacke_dgebrd.c + lapacke_dgebrd_work.c + lapacke_dgecon.c + lapacke_dgecon_work.c + lapacke_dgeequ.c + lapacke_dgeequ_work.c + lapacke_dgeequb.c + lapacke_dgeequb_work.c + lapacke_dgees.c + lapacke_dgees_work.c + lapacke_dgeesx.c + lapacke_dgeesx_work.c + 
lapacke_dgeev.c + lapacke_dgeev_work.c + lapacke_dgeevx.c + lapacke_dgeevx_work.c + lapacke_dgehrd.c + lapacke_dgehrd_work.c + lapacke_dgejsv.c + lapacke_dgejsv_work.c + lapacke_dgelq2.c + lapacke_dgelq2_work.c + lapacke_dgelqf.c + lapacke_dgelqf_work.c + lapacke_dgels.c + lapacke_dgels_work.c + lapacke_dgelsd.c + lapacke_dgelsd_work.c + lapacke_dgelss.c + lapacke_dgelss_work.c + lapacke_dgelsy.c + lapacke_dgelsy_work.c + lapacke_dgemqrt.c + lapacke_dgemqrt_work.c + lapacke_dgeqlf.c + lapacke_dgeqlf_work.c + lapacke_dgeqp3.c + lapacke_dgeqp3_work.c + lapacke_dgeqpf.c + lapacke_dgeqpf_work.c + lapacke_dgeqr2.c + lapacke_dgeqr2_work.c + lapacke_dgeqrf.c + lapacke_dgeqrf_work.c + lapacke_dgeqrfp.c + lapacke_dgeqrfp_work.c + lapacke_dgeqrt.c + lapacke_dgeqrt2.c + lapacke_dgeqrt2_work.c + lapacke_dgeqrt3.c + lapacke_dgeqrt3_work.c + lapacke_dgeqrt_work.c + lapacke_dgerfs.c + lapacke_dgerfs_work.c + lapacke_dgerqf.c + lapacke_dgerqf_work.c + lapacke_dgesdd.c + lapacke_dgesdd_work.c + lapacke_dgesv.c + lapacke_dgesv_work.c + lapacke_dgesvd.c + lapacke_dgesvd_work.c + lapacke_dgesvj.c + lapacke_dgesvj_work.c + lapacke_dgesvx.c + lapacke_dgesvx_work.c + lapacke_dgetf2.c + lapacke_dgetf2_work.c + lapacke_dgetrf.c + lapacke_dgetrf_work.c + lapacke_dgetri.c + lapacke_dgetri_work.c + lapacke_dgetrs.c + lapacke_dgetrs_work.c + lapacke_dggbak.c + lapacke_dggbak_work.c + lapacke_dggbal.c + lapacke_dggbal_work.c + lapacke_dgges.c + lapacke_dgges_work.c + lapacke_dggesx.c + lapacke_dggesx_work.c + lapacke_dggev.c + lapacke_dggev_work.c + lapacke_dggevx.c + lapacke_dggevx_work.c + lapacke_dggglm.c + lapacke_dggglm_work.c + lapacke_dgghrd.c + lapacke_dgghrd_work.c + lapacke_dgglse.c + lapacke_dgglse_work.c + lapacke_dggqrf.c + lapacke_dggqrf_work.c + lapacke_dggrqf.c + lapacke_dggrqf_work.c + lapacke_dggsvd.c + lapacke_dggsvd_work.c + lapacke_dggsvp.c + lapacke_dggsvp_work.c + lapacke_dgtcon.c + lapacke_dgtcon_work.c + lapacke_dgtrfs.c + lapacke_dgtrfs_work.c + lapacke_dgtsv.c + 
lapacke_dgtsv_work.c + lapacke_dgtsvx.c + lapacke_dgtsvx_work.c + lapacke_dgttrf.c + lapacke_dgttrf_work.c + lapacke_dgttrs.c + lapacke_dgttrs_work.c + lapacke_dhgeqz.c + lapacke_dhgeqz_work.c + lapacke_dhsein.c + lapacke_dhsein_work.c + lapacke_dhseqr.c + lapacke_dhseqr_work.c + lapacke_dlacn2.c + lapacke_dlacn2_work.c + lapacke_dlacpy.c + lapacke_dlacpy_work.c + lapacke_dlag2s.c + lapacke_dlag2s_work.c + lapacke_dlamch.c + lapacke_dlamch_work.c + lapacke_dlange.c + lapacke_dlange_work.c + lapacke_dlansy.c + lapacke_dlansy_work.c + lapacke_dlantr.c + lapacke_dlantr_work.c + lapacke_dlapmr.c + lapacke_dlapmr_work.c + lapacke_dlapy2.c + lapacke_dlapy2_work.c + lapacke_dlapy3.c + lapacke_dlapy3_work.c + lapacke_dlarfb.c + lapacke_dlarfb_work.c + lapacke_dlarfg.c + lapacke_dlarfg_work.c + lapacke_dlarft.c + lapacke_dlarft_work.c + lapacke_dlarfx.c + lapacke_dlarfx_work.c + lapacke_dlarnv.c + lapacke_dlarnv_work.c + lapacke_dlartgp.c + lapacke_dlartgp_work.c + lapacke_dlartgs.c + lapacke_dlartgs_work.c + lapacke_dlaset.c + lapacke_dlaset_work.c + lapacke_dlasrt.c + lapacke_dlasrt_work.c + lapacke_dlaswp.c + lapacke_dlaswp_work.c + lapacke_dlauum.c + lapacke_dlauum_work.c + lapacke_dopgtr.c + lapacke_dopgtr_work.c + lapacke_dopmtr.c + lapacke_dopmtr_work.c + lapacke_dorbdb.c + lapacke_dorbdb_work.c + lapacke_dorcsd.c + lapacke_dorcsd_work.c + lapacke_dorgbr.c + lapacke_dorgbr_work.c + lapacke_dorghr.c + lapacke_dorghr_work.c + lapacke_dorglq.c + lapacke_dorglq_work.c + lapacke_dorgql.c + lapacke_dorgql_work.c + lapacke_dorgqr.c + lapacke_dorgqr_work.c + lapacke_dorgrq.c + lapacke_dorgrq_work.c + lapacke_dorgtr.c + lapacke_dorgtr_work.c + lapacke_dormbr.c + lapacke_dormbr_work.c + lapacke_dormhr.c + lapacke_dormhr_work.c + lapacke_dormlq.c + lapacke_dormlq_work.c + lapacke_dormql.c + lapacke_dormql_work.c + lapacke_dormqr.c + lapacke_dormqr_work.c + lapacke_dormrq.c + lapacke_dormrq_work.c + lapacke_dormrz.c + lapacke_dormrz_work.c + lapacke_dormtr.c + 
lapacke_dormtr_work.c + lapacke_dpbcon.c + lapacke_dpbcon_work.c + lapacke_dpbequ.c + lapacke_dpbequ_work.c + lapacke_dpbrfs.c + lapacke_dpbrfs_work.c + lapacke_dpbstf.c + lapacke_dpbstf_work.c + lapacke_dpbsv.c + lapacke_dpbsv_work.c + lapacke_dpbsvx.c + lapacke_dpbsvx_work.c + lapacke_dpbtrf.c + lapacke_dpbtrf_work.c + lapacke_dpbtrs.c + lapacke_dpbtrs_work.c + lapacke_dpftrf.c + lapacke_dpftrf_work.c + lapacke_dpftri.c + lapacke_dpftri_work.c + lapacke_dpftrs.c + lapacke_dpftrs_work.c + lapacke_dpocon.c + lapacke_dpocon_work.c + lapacke_dpoequ.c + lapacke_dpoequ_work.c + lapacke_dpoequb.c + lapacke_dpoequb_work.c + lapacke_dporfs.c + lapacke_dporfs_work.c + lapacke_dposv.c + lapacke_dposv_work.c + lapacke_dposvx.c + lapacke_dposvx_work.c + lapacke_dpotrf.c + lapacke_dpotrf_work.c + lapacke_dpotri.c + lapacke_dpotri_work.c + lapacke_dpotrs.c + lapacke_dpotrs_work.c + lapacke_dppcon.c + lapacke_dppcon_work.c + lapacke_dppequ.c + lapacke_dppequ_work.c + lapacke_dpprfs.c + lapacke_dpprfs_work.c + lapacke_dppsv.c + lapacke_dppsv_work.c + lapacke_dppsvx.c + lapacke_dppsvx_work.c + lapacke_dpptrf.c + lapacke_dpptrf_work.c + lapacke_dpptri.c + lapacke_dpptri_work.c + lapacke_dpptrs.c + lapacke_dpptrs_work.c + lapacke_dpstrf.c + lapacke_dpstrf_work.c + lapacke_dptcon.c + lapacke_dptcon_work.c + lapacke_dpteqr.c + lapacke_dpteqr_work.c + lapacke_dptrfs.c + lapacke_dptrfs_work.c + lapacke_dptsv.c + lapacke_dptsv_work.c + lapacke_dptsvx.c + lapacke_dptsvx_work.c + lapacke_dpttrf.c + lapacke_dpttrf_work.c + lapacke_dpttrs.c + lapacke_dpttrs_work.c + lapacke_dsbev.c + lapacke_dsbev_work.c + lapacke_dsbevd.c + lapacke_dsbevd_work.c + lapacke_dsbevx.c + lapacke_dsbevx_work.c + lapacke_dsbgst.c + lapacke_dsbgst_work.c + lapacke_dsbgv.c + lapacke_dsbgv_work.c + lapacke_dsbgvd.c + lapacke_dsbgvd_work.c + lapacke_dsbgvx.c + lapacke_dsbgvx_work.c + lapacke_dsbtrd.c + lapacke_dsbtrd_work.c + lapacke_dsfrk.c + lapacke_dsfrk_work.c + lapacke_dsgesv.c + lapacke_dsgesv_work.c + 
lapacke_dspcon.c + lapacke_dspcon_work.c + lapacke_dspev.c + lapacke_dspev_work.c + lapacke_dspevd.c + lapacke_dspevd_work.c + lapacke_dspevx.c + lapacke_dspevx_work.c + lapacke_dspgst.c + lapacke_dspgst_work.c + lapacke_dspgv.c + lapacke_dspgv_work.c + lapacke_dspgvd.c + lapacke_dspgvd_work.c + lapacke_dspgvx.c + lapacke_dspgvx_work.c + lapacke_dsposv.c + lapacke_dsposv_work.c + lapacke_dsprfs.c + lapacke_dsprfs_work.c + lapacke_dspsv.c + lapacke_dspsv_work.c + lapacke_dspsvx.c + lapacke_dspsvx_work.c + lapacke_dsptrd.c + lapacke_dsptrd_work.c + lapacke_dsptrf.c + lapacke_dsptrf_work.c + lapacke_dsptri.c + lapacke_dsptri_work.c + lapacke_dsptrs.c + lapacke_dsptrs_work.c + lapacke_dstebz.c + lapacke_dstebz_work.c + lapacke_dstedc.c + lapacke_dstedc_work.c + lapacke_dstegr.c + lapacke_dstegr_work.c + lapacke_dstein.c + lapacke_dstein_work.c + lapacke_dstemr.c + lapacke_dstemr_work.c + lapacke_dsteqr.c + lapacke_dsteqr_work.c + lapacke_dsterf.c + lapacke_dsterf_work.c + lapacke_dstev.c + lapacke_dstev_work.c + lapacke_dstevd.c + lapacke_dstevd_work.c + lapacke_dstevr.c + lapacke_dstevr_work.c + lapacke_dstevx.c + lapacke_dstevx_work.c + lapacke_dsycon.c + lapacke_dsycon_work.c + lapacke_dsyconv.c + lapacke_dsyconv_work.c + lapacke_dsyequb.c + lapacke_dsyequb_work.c + lapacke_dsyev.c + lapacke_dsyev_work.c + lapacke_dsyevd.c + lapacke_dsyevd_work.c + lapacke_dsyevr.c + lapacke_dsyevr_work.c + lapacke_dsyevx.c + lapacke_dsyevx_work.c + lapacke_dsygst.c + lapacke_dsygst_work.c + lapacke_dsygv.c + lapacke_dsygv_work.c + lapacke_dsygvd.c + lapacke_dsygvd_work.c + lapacke_dsygvx.c + lapacke_dsygvx_work.c + lapacke_dsyrfs.c + lapacke_dsyrfs_work.c + lapacke_dsysv.c + lapacke_dsysv_rook.c + lapacke_dsysv_rook_work.c + lapacke_dsysv_work.c + lapacke_dsysvx.c + lapacke_dsysvx_work.c + lapacke_dsyswapr.c + lapacke_dsyswapr_work.c + lapacke_dsytrd.c + lapacke_dsytrd_work.c + lapacke_dsytrf.c + lapacke_dsytrf_work.c + lapacke_dsytri.c + lapacke_dsytri2.c + lapacke_dsytri2_work.c 
+ lapacke_dsytri2x.c + lapacke_dsytri2x_work.c + lapacke_dsytri_work.c + lapacke_dsytrs.c + lapacke_dsytrs2.c + lapacke_dsytrs2_work.c + lapacke_dsytrs_work.c + lapacke_dtbcon.c + lapacke_dtbcon_work.c + lapacke_dtbrfs.c + lapacke_dtbrfs_work.c + lapacke_dtbtrs.c + lapacke_dtbtrs_work.c + lapacke_dtfsm.c + lapacke_dtfsm_work.c + lapacke_dtftri.c + lapacke_dtftri_work.c + lapacke_dtfttp.c + lapacke_dtfttp_work.c + lapacke_dtfttr.c + lapacke_dtfttr_work.c + lapacke_dtgevc.c + lapacke_dtgevc_work.c + lapacke_dtgexc.c + lapacke_dtgexc_work.c + lapacke_dtgsen.c + lapacke_dtgsen_work.c + lapacke_dtgsja.c + lapacke_dtgsja_work.c + lapacke_dtgsna.c + lapacke_dtgsna_work.c + lapacke_dtgsyl.c + lapacke_dtgsyl_work.c + lapacke_dtpcon.c + lapacke_dtpcon_work.c + lapacke_dtpmqrt.c + lapacke_dtpmqrt_work.c + lapacke_dtpqrt.c + lapacke_dtpqrt2.c + lapacke_dtpqrt2_work.c + lapacke_dtpqrt_work.c + lapacke_dtprfb.c + lapacke_dtprfb_work.c + lapacke_dtprfs.c + lapacke_dtprfs_work.c + lapacke_dtptri.c + lapacke_dtptri_work.c + lapacke_dtptrs.c + lapacke_dtptrs_work.c + lapacke_dtpttf.c + lapacke_dtpttf_work.c + lapacke_dtpttr.c + lapacke_dtpttr_work.c + lapacke_dtrcon.c + lapacke_dtrcon_work.c + lapacke_dtrevc.c + lapacke_dtrevc_work.c + lapacke_dtrexc.c + lapacke_dtrexc_work.c + lapacke_dtrrfs.c + lapacke_dtrrfs_work.c + lapacke_dtrsen.c + lapacke_dtrsen_work.c + lapacke_dtrsna.c + lapacke_dtrsna_work.c + lapacke_dtrsyl.c + lapacke_dtrsyl_work.c + lapacke_dtrtri.c + lapacke_dtrtri_work.c + lapacke_dtrtrs.c + lapacke_dtrtrs_work.c + lapacke_dtrttf.c + lapacke_dtrttf_work.c + lapacke_dtrttp.c + lapacke_dtrttp_work.c + lapacke_dtzrzf.c + lapacke_dtzrzf_work.c +) + +set(SSRC + lapacke_sbbcsd.c + lapacke_sbbcsd_work.c + lapacke_sbdsdc.c + lapacke_sbdsdc_work.c + lapacke_sbdsqr.c + lapacke_sbdsqr_work.c + lapacke_sdisna.c + lapacke_sdisna_work.c + lapacke_sgbbrd.c + lapacke_sgbbrd_work.c + lapacke_sgbcon.c + lapacke_sgbcon_work.c + lapacke_sgbequ.c + lapacke_sgbequ_work.c + 
lapacke_sgbequb.c + lapacke_sgbequb_work.c + lapacke_sgbrfs.c + lapacke_sgbrfs_work.c + lapacke_sgbsv.c + lapacke_sgbsv_work.c + lapacke_sgbsvx.c + lapacke_sgbsvx_work.c + lapacke_sgbtrf.c + lapacke_sgbtrf_work.c + lapacke_sgbtrs.c + lapacke_sgbtrs_work.c + lapacke_sgebak.c + lapacke_sgebak_work.c + lapacke_sgebal.c + lapacke_sgebal_work.c + lapacke_sgebrd.c + lapacke_sgebrd_work.c + lapacke_sgecon.c + lapacke_sgecon_work.c + lapacke_sgeequ.c + lapacke_sgeequ_work.c + lapacke_sgeequb.c + lapacke_sgeequb_work.c + lapacke_sgees.c + lapacke_sgees_work.c + lapacke_sgeesx.c + lapacke_sgeesx_work.c + lapacke_sgeev.c + lapacke_sgeev_work.c + lapacke_sgeevx.c + lapacke_sgeevx_work.c + lapacke_sgehrd.c + lapacke_sgehrd_work.c + lapacke_sgejsv.c + lapacke_sgejsv_work.c + lapacke_sgelq2.c + lapacke_sgelq2_work.c + lapacke_sgelqf.c + lapacke_sgelqf_work.c + lapacke_sgels.c + lapacke_sgels_work.c + lapacke_sgelsd.c + lapacke_sgelsd_work.c + lapacke_sgelss.c + lapacke_sgelss_work.c + lapacke_sgelsy.c + lapacke_sgelsy_work.c + lapacke_sgemqrt.c + lapacke_sgemqrt_work.c + lapacke_sgeqlf.c + lapacke_sgeqlf_work.c + lapacke_sgeqp3.c + lapacke_sgeqp3_work.c + lapacke_sgeqpf.c + lapacke_sgeqpf_work.c + lapacke_sgeqr2.c + lapacke_sgeqr2_work.c + lapacke_sgeqrf.c + lapacke_sgeqrf_work.c + lapacke_sgeqrfp.c + lapacke_sgeqrfp_work.c + lapacke_sgeqrt.c + lapacke_sgeqrt2.c + lapacke_sgeqrt2_work.c + lapacke_sgeqrt3.c + lapacke_sgeqrt3_work.c + lapacke_sgeqrt_work.c + lapacke_sgerfs.c + lapacke_sgerfs_work.c + lapacke_sgerqf.c + lapacke_sgerqf_work.c + lapacke_sgesdd.c + lapacke_sgesdd_work.c + lapacke_sgesv.c + lapacke_sgesv_work.c + lapacke_sgesvd.c + lapacke_sgesvd_work.c + lapacke_sgesvj.c + lapacke_sgesvj_work.c + lapacke_sgesvx.c + lapacke_sgesvx_work.c + lapacke_sgetf2.c + lapacke_sgetf2_work.c + lapacke_sgetrf.c + lapacke_sgetrf_work.c + lapacke_sgetri.c + lapacke_sgetri_work.c + lapacke_sgetrs.c + lapacke_sgetrs_work.c + lapacke_sggbak.c + lapacke_sggbak_work.c + lapacke_sggbal.c + 
lapacke_sggbal_work.c + lapacke_sgges.c + lapacke_sgges_work.c + lapacke_sggesx.c + lapacke_sggesx_work.c + lapacke_sggev.c + lapacke_sggev_work.c + lapacke_sggevx.c + lapacke_sggevx_work.c + lapacke_sggglm.c + lapacke_sggglm_work.c + lapacke_sgghrd.c + lapacke_sgghrd_work.c + lapacke_sgglse.c + lapacke_sgglse_work.c + lapacke_sggqrf.c + lapacke_sggqrf_work.c + lapacke_sggrqf.c + lapacke_sggrqf_work.c + lapacke_sggsvd.c + lapacke_sggsvd_work.c + lapacke_sggsvp.c + lapacke_sggsvp_work.c + lapacke_sgtcon.c + lapacke_sgtcon_work.c + lapacke_sgtrfs.c + lapacke_sgtrfs_work.c + lapacke_sgtsv.c + lapacke_sgtsv_work.c + lapacke_sgtsvx.c + lapacke_sgtsvx_work.c + lapacke_sgttrf.c + lapacke_sgttrf_work.c + lapacke_sgttrs.c + lapacke_sgttrs_work.c + lapacke_shgeqz.c + lapacke_shgeqz_work.c + lapacke_shsein.c + lapacke_shsein_work.c + lapacke_shseqr.c + lapacke_shseqr_work.c + lapacke_slacn2.c + lapacke_slacn2_work.c + lapacke_slacpy.c + lapacke_slacpy_work.c + lapacke_slag2d.c + lapacke_slag2d_work.c + lapacke_slamch.c + lapacke_slamch_work.c + lapacke_slange.c + lapacke_slange_work.c + lapacke_slansy.c + lapacke_slansy_work.c + lapacke_slantr.c + lapacke_slantr_work.c + lapacke_slapmr.c + lapacke_slapmr_work.c + lapacke_slapy2.c + lapacke_slapy2_work.c + lapacke_slapy3.c + lapacke_slapy3_work.c + lapacke_slarfb.c + lapacke_slarfb_work.c + lapacke_slarfg.c + lapacke_slarfg_work.c + lapacke_slarft.c + lapacke_slarft_work.c + lapacke_slarfx.c + lapacke_slarfx_work.c + lapacke_slarnv.c + lapacke_slarnv_work.c + lapacke_slartgp.c + lapacke_slartgp_work.c + lapacke_slartgs.c + lapacke_slartgs_work.c + lapacke_slaset.c + lapacke_slaset_work.c + lapacke_slasrt.c + lapacke_slasrt_work.c + lapacke_slaswp.c + lapacke_slaswp_work.c + lapacke_slauum.c + lapacke_slauum_work.c + lapacke_sopgtr.c + lapacke_sopgtr_work.c + lapacke_sopmtr.c + lapacke_sopmtr_work.c + lapacke_sorbdb.c + lapacke_sorbdb_work.c + lapacke_sorcsd.c + lapacke_sorcsd_work.c + lapacke_sorgbr.c + lapacke_sorgbr_work.c + 
lapacke_sorghr.c + lapacke_sorghr_work.c + lapacke_sorglq.c + lapacke_sorglq_work.c + lapacke_sorgql.c + lapacke_sorgql_work.c + lapacke_sorgqr.c + lapacke_sorgqr_work.c + lapacke_sorgrq.c + lapacke_sorgrq_work.c + lapacke_sorgtr.c + lapacke_sorgtr_work.c + lapacke_sormbr.c + lapacke_sormbr_work.c + lapacke_sormhr.c + lapacke_sormhr_work.c + lapacke_sormlq.c + lapacke_sormlq_work.c + lapacke_sormql.c + lapacke_sormql_work.c + lapacke_sormqr.c + lapacke_sormqr_work.c + lapacke_sormrq.c + lapacke_sormrq_work.c + lapacke_sormrz.c + lapacke_sormrz_work.c + lapacke_sormtr.c + lapacke_sormtr_work.c + lapacke_spbcon.c + lapacke_spbcon_work.c + lapacke_spbequ.c + lapacke_spbequ_work.c + lapacke_spbrfs.c + lapacke_spbrfs_work.c + lapacke_spbstf.c + lapacke_spbstf_work.c + lapacke_spbsv.c + lapacke_spbsv_work.c + lapacke_spbsvx.c + lapacke_spbsvx_work.c + lapacke_spbtrf.c + lapacke_spbtrf_work.c + lapacke_spbtrs.c + lapacke_spbtrs_work.c + lapacke_spftrf.c + lapacke_spftrf_work.c + lapacke_spftri.c + lapacke_spftri_work.c + lapacke_spftrs.c + lapacke_spftrs_work.c + lapacke_spocon.c + lapacke_spocon_work.c + lapacke_spoequ.c + lapacke_spoequ_work.c + lapacke_spoequb.c + lapacke_spoequb_work.c + lapacke_sporfs.c + lapacke_sporfs_work.c + lapacke_sposv.c + lapacke_sposv_work.c + lapacke_sposvx.c + lapacke_sposvx_work.c + lapacke_spotrf.c + lapacke_spotrf_work.c + lapacke_spotri.c + lapacke_spotri_work.c + lapacke_spotrs.c + lapacke_spotrs_work.c + lapacke_sppcon.c + lapacke_sppcon_work.c + lapacke_sppequ.c + lapacke_sppequ_work.c + lapacke_spprfs.c + lapacke_spprfs_work.c + lapacke_sppsv.c + lapacke_sppsv_work.c + lapacke_sppsvx.c + lapacke_sppsvx_work.c + lapacke_spptrf.c + lapacke_spptrf_work.c + lapacke_spptri.c + lapacke_spptri_work.c + lapacke_spptrs.c + lapacke_spptrs_work.c + lapacke_spstrf.c + lapacke_spstrf_work.c + lapacke_sptcon.c + lapacke_sptcon_work.c + lapacke_spteqr.c + lapacke_spteqr_work.c + lapacke_sptrfs.c + lapacke_sptrfs_work.c + lapacke_sptsv.c + 
lapacke_sptsv_work.c + lapacke_sptsvx.c + lapacke_sptsvx_work.c + lapacke_spttrf.c + lapacke_spttrf_work.c + lapacke_spttrs.c + lapacke_spttrs_work.c + lapacke_ssbev.c + lapacke_ssbev_work.c + lapacke_ssbevd.c + lapacke_ssbevd_work.c + lapacke_ssbevx.c + lapacke_ssbevx_work.c + lapacke_ssbgst.c + lapacke_ssbgst_work.c + lapacke_ssbgv.c + lapacke_ssbgv_work.c + lapacke_ssbgvd.c + lapacke_ssbgvd_work.c + lapacke_ssbgvx.c + lapacke_ssbgvx_work.c + lapacke_ssbtrd.c + lapacke_ssbtrd_work.c + lapacke_ssfrk.c + lapacke_ssfrk_work.c + lapacke_sspcon.c + lapacke_sspcon_work.c + lapacke_sspev.c + lapacke_sspev_work.c + lapacke_sspevd.c + lapacke_sspevd_work.c + lapacke_sspevx.c + lapacke_sspevx_work.c + lapacke_sspgst.c + lapacke_sspgst_work.c + lapacke_sspgv.c + lapacke_sspgv_work.c + lapacke_sspgvd.c + lapacke_sspgvd_work.c + lapacke_sspgvx.c + lapacke_sspgvx_work.c + lapacke_ssprfs.c + lapacke_ssprfs_work.c + lapacke_sspsv.c + lapacke_sspsv_work.c + lapacke_sspsvx.c + lapacke_sspsvx_work.c + lapacke_ssptrd.c + lapacke_ssptrd_work.c + lapacke_ssptrf.c + lapacke_ssptrf_work.c + lapacke_ssptri.c + lapacke_ssptri_work.c + lapacke_ssptrs.c + lapacke_ssptrs_work.c + lapacke_sstebz.c + lapacke_sstebz_work.c + lapacke_sstedc.c + lapacke_sstedc_work.c + lapacke_sstegr.c + lapacke_sstegr_work.c + lapacke_sstein.c + lapacke_sstein_work.c + lapacke_sstemr.c + lapacke_sstemr_work.c + lapacke_ssteqr.c + lapacke_ssteqr_work.c + lapacke_ssterf.c + lapacke_ssterf_work.c + lapacke_sstev.c + lapacke_sstev_work.c + lapacke_sstevd.c + lapacke_sstevd_work.c + lapacke_sstevr.c + lapacke_sstevr_work.c + lapacke_sstevx.c + lapacke_sstevx_work.c + lapacke_ssycon.c + lapacke_ssycon_work.c + lapacke_ssyconv.c + lapacke_ssyconv_work.c + lapacke_ssyequb.c + lapacke_ssyequb_work.c + lapacke_ssyev.c + lapacke_ssyev_work.c + lapacke_ssyevd.c + lapacke_ssyevd_work.c + lapacke_ssyevr.c + lapacke_ssyevr_work.c + lapacke_ssyevx.c + lapacke_ssyevx_work.c + lapacke_ssygst.c + lapacke_ssygst_work.c + 
lapacke_ssygv.c + lapacke_ssygv_work.c + lapacke_ssygvd.c + lapacke_ssygvd_work.c + lapacke_ssygvx.c + lapacke_ssygvx_work.c + lapacke_ssyrfs.c + lapacke_ssyrfs_work.c + lapacke_ssysv.c + lapacke_ssysv_rook.c + lapacke_ssysv_rook_work.c + lapacke_ssysv_work.c + lapacke_ssysvx.c + lapacke_ssysvx_work.c + lapacke_ssyswapr.c + lapacke_ssyswapr_work.c + lapacke_ssytrd.c + lapacke_ssytrd_work.c + lapacke_ssytrf.c + lapacke_ssytrf_work.c + lapacke_ssytri.c + lapacke_ssytri2.c + lapacke_ssytri2_work.c + lapacke_ssytri2x.c + lapacke_ssytri2x_work.c + lapacke_ssytri_work.c + lapacke_ssytrs.c + lapacke_ssytrs2.c + lapacke_ssytrs2_work.c + lapacke_ssytrs_work.c + lapacke_stbcon.c + lapacke_stbcon_work.c + lapacke_stbrfs.c + lapacke_stbrfs_work.c + lapacke_stbtrs.c + lapacke_stbtrs_work.c + lapacke_stfsm.c + lapacke_stfsm_work.c + lapacke_stftri.c + lapacke_stftri_work.c + lapacke_stfttp.c + lapacke_stfttp_work.c + lapacke_stfttr.c + lapacke_stfttr_work.c + lapacke_stgevc.c + lapacke_stgevc_work.c + lapacke_stgexc.c + lapacke_stgexc_work.c + lapacke_stgsen.c + lapacke_stgsen_work.c + lapacke_stgsja.c + lapacke_stgsja_work.c + lapacke_stgsna.c + lapacke_stgsna_work.c + lapacke_stgsyl.c + lapacke_stgsyl_work.c + lapacke_stpcon.c + lapacke_stpcon_work.c + lapacke_stpmqrt.c + lapacke_stpmqrt_work.c + lapacke_stpqrt2.c + lapacke_stpqrt2_work.c + lapacke_stprfb.c + lapacke_stprfb_work.c + lapacke_stprfs.c + lapacke_stprfs_work.c + lapacke_stptri.c + lapacke_stptri_work.c + lapacke_stptrs.c + lapacke_stptrs_work.c + lapacke_stpttf.c + lapacke_stpttf_work.c + lapacke_stpttr.c + lapacke_stpttr_work.c + lapacke_strcon.c + lapacke_strcon_work.c + lapacke_strevc.c + lapacke_strevc_work.c + lapacke_strexc.c + lapacke_strexc_work.c + lapacke_strrfs.c + lapacke_strrfs_work.c + lapacke_strsen.c + lapacke_strsen_work.c + lapacke_strsna.c + lapacke_strsna_work.c + lapacke_strsyl.c + lapacke_strsyl_work.c + lapacke_strtri.c + lapacke_strtri_work.c + lapacke_strtrs.c + lapacke_strtrs_work.c + 
lapacke_strttf.c + lapacke_strttf_work.c + lapacke_strttp.c + lapacke_strttp_work.c + lapacke_stzrzf.c + lapacke_stzrzf_work.c +) + +set(ZSRC + lapacke_zbbcsd.c + lapacke_zbbcsd_work.c + lapacke_zbdsqr.c + lapacke_zbdsqr_work.c + lapacke_zcgesv.c + lapacke_zcgesv_work.c + lapacke_zcposv.c + lapacke_zcposv_work.c + lapacke_zgbbrd.c + lapacke_zgbbrd_work.c + lapacke_zgbcon.c + lapacke_zgbcon_work.c + lapacke_zgbequ.c + lapacke_zgbequ_work.c + lapacke_zgbequb.c + lapacke_zgbequb_work.c + lapacke_zgbrfs.c + lapacke_zgbrfs_work.c + lapacke_zgbsv.c + lapacke_zgbsv_work.c + lapacke_zgbsvx.c + lapacke_zgbsvx_work.c + lapacke_zgbtrf.c + lapacke_zgbtrf_work.c + lapacke_zgbtrs.c + lapacke_zgbtrs_work.c + lapacke_zgebak.c + lapacke_zgebak_work.c + lapacke_zgebal.c + lapacke_zgebal_work.c + lapacke_zgebrd.c + lapacke_zgebrd_work.c + lapacke_zgecon.c + lapacke_zgecon_work.c + lapacke_zgeequ.c + lapacke_zgeequ_work.c + lapacke_zgeequb.c + lapacke_zgeequb_work.c + lapacke_zgees.c + lapacke_zgees_work.c + lapacke_zgeesx.c + lapacke_zgeesx_work.c + lapacke_zgeev.c + lapacke_zgeev_work.c + lapacke_zgeevx.c + lapacke_zgeevx_work.c + lapacke_zgehrd.c + lapacke_zgehrd_work.c + lapacke_zgelq2.c + lapacke_zgelq2_work.c + lapacke_zgelqf.c + lapacke_zgelqf_work.c + lapacke_zgels.c + lapacke_zgels_work.c + lapacke_zgelsd.c + lapacke_zgelsd_work.c + lapacke_zgelss.c + lapacke_zgelss_work.c + lapacke_zgelsy.c + lapacke_zgelsy_work.c + lapacke_zgemqrt.c + lapacke_zgemqrt_work.c + lapacke_zgeqlf.c + lapacke_zgeqlf_work.c + lapacke_zgeqp3.c + lapacke_zgeqp3_work.c + lapacke_zgeqpf.c + lapacke_zgeqpf_work.c + lapacke_zgeqr2.c + lapacke_zgeqr2_work.c + lapacke_zgeqrf.c + lapacke_zgeqrf_work.c + lapacke_zgeqrfp.c + lapacke_zgeqrfp_work.c + lapacke_zgeqrt.c + lapacke_zgeqrt2.c + lapacke_zgeqrt2_work.c + lapacke_zgeqrt3.c + lapacke_zgeqrt3_work.c + lapacke_zgeqrt_work.c + lapacke_zgerfs.c + lapacke_zgerfs_work.c + lapacke_zgerqf.c + lapacke_zgerqf_work.c + lapacke_zgesdd.c + lapacke_zgesdd_work.c + 
lapacke_zgesv.c + lapacke_zgesv_work.c + lapacke_zgesvd.c + lapacke_zgesvd_work.c + lapacke_zgesvx.c + lapacke_zgesvx_work.c + lapacke_zgetf2.c + lapacke_zgetf2_work.c + lapacke_zgetrf.c + lapacke_zgetrf_work.c + lapacke_zgetri.c + lapacke_zgetri_work.c + lapacke_zgetrs.c + lapacke_zgetrs_work.c + lapacke_zggbak.c + lapacke_zggbak_work.c + lapacke_zggbal.c + lapacke_zggbal_work.c + lapacke_zgges.c + lapacke_zgges_work.c + lapacke_zggesx.c + lapacke_zggesx_work.c + lapacke_zggev.c + lapacke_zggev_work.c + lapacke_zggevx.c + lapacke_zggevx_work.c + lapacke_zggglm.c + lapacke_zggglm_work.c + lapacke_zgghrd.c + lapacke_zgghrd_work.c + lapacke_zgglse.c + lapacke_zgglse_work.c + lapacke_zggqrf.c + lapacke_zggqrf_work.c + lapacke_zggrqf.c + lapacke_zggrqf_work.c + lapacke_zggsvd.c + lapacke_zggsvd_work.c + lapacke_zggsvp.c + lapacke_zggsvp_work.c + lapacke_zgtcon.c + lapacke_zgtcon_work.c + lapacke_zgtrfs.c + lapacke_zgtrfs_work.c + lapacke_zgtsv.c + lapacke_zgtsv_work.c + lapacke_zgtsvx.c + lapacke_zgtsvx_work.c + lapacke_zgttrf.c + lapacke_zgttrf_work.c + lapacke_zgttrs.c + lapacke_zgttrs_work.c + lapacke_zhbev.c + lapacke_zhbev_work.c + lapacke_zhbevd.c + lapacke_zhbevd_work.c + lapacke_zhbevx.c + lapacke_zhbevx_work.c + lapacke_zhbgst.c + lapacke_zhbgst_work.c + lapacke_zhbgv.c + lapacke_zhbgv_work.c + lapacke_zhbgvd.c + lapacke_zhbgvd_work.c + lapacke_zhbgvx.c + lapacke_zhbgvx_work.c + lapacke_zhbtrd.c + lapacke_zhbtrd_work.c + lapacke_zhecon.c + lapacke_zhecon_work.c + lapacke_zheequb.c + lapacke_zheequb_work.c + lapacke_zheev.c + lapacke_zheev_work.c + lapacke_zheevd.c + lapacke_zheevd_work.c + lapacke_zheevr.c + lapacke_zheevr_work.c + lapacke_zheevx.c + lapacke_zheevx_work.c + lapacke_zhegst.c + lapacke_zhegst_work.c + lapacke_zhegv.c + lapacke_zhegv_work.c + lapacke_zhegvd.c + lapacke_zhegvd_work.c + lapacke_zhegvx.c + lapacke_zhegvx_work.c + lapacke_zherfs.c + lapacke_zherfs_work.c + lapacke_zhesv.c + lapacke_zhesv_work.c + lapacke_zhesvx.c + 
lapacke_zhesvx_work.c + lapacke_zheswapr.c + lapacke_zheswapr_work.c + lapacke_zhetrd.c + lapacke_zhetrd_work.c + lapacke_zhetrf.c + lapacke_zhetrf_work.c + lapacke_zhetri.c + lapacke_zhetri2.c + lapacke_zhetri2_work.c + lapacke_zhetri2x.c + lapacke_zhetri2x_work.c + lapacke_zhetri_work.c + lapacke_zhetrs.c + lapacke_zhetrs2.c + lapacke_zhetrs2_work.c + lapacke_zhetrs_work.c + lapacke_zhfrk.c + lapacke_zhfrk_work.c + lapacke_zhgeqz.c + lapacke_zhgeqz_work.c + lapacke_zhpcon.c + lapacke_zhpcon_work.c + lapacke_zhpev.c + lapacke_zhpev_work.c + lapacke_zhpevd.c + lapacke_zhpevd_work.c + lapacke_zhpevx.c + lapacke_zhpevx_work.c + lapacke_zhpgst.c + lapacke_zhpgst_work.c + lapacke_zhpgv.c + lapacke_zhpgv_work.c + lapacke_zhpgvd.c + lapacke_zhpgvd_work.c + lapacke_zhpgvx.c + lapacke_zhpgvx_work.c + lapacke_zhprfs.c + lapacke_zhprfs_work.c + lapacke_zhpsv.c + lapacke_zhpsv_work.c + lapacke_zhpsvx.c + lapacke_zhpsvx_work.c + lapacke_zhptrd.c + lapacke_zhptrd_work.c + lapacke_zhptrf.c + lapacke_zhptrf_work.c + lapacke_zhptri.c + lapacke_zhptri_work.c + lapacke_zhptrs.c + lapacke_zhptrs_work.c + lapacke_zhsein.c + lapacke_zhsein_work.c + lapacke_zhseqr.c + lapacke_zhseqr_work.c + lapacke_zlacgv.c + lapacke_zlacgv_work.c + lapacke_zlacn2.c + lapacke_zlacn2_work.c + lapacke_zlacp2.c + lapacke_zlacp2_work.c + lapacke_zlacpy.c + lapacke_zlacpy_work.c + lapacke_zlag2c.c + lapacke_zlag2c_work.c + lapacke_zlange.c + lapacke_zlange_work.c + lapacke_zlanhe.c + lapacke_zlanhe_work.c + lapacke_zlansy.c + lapacke_zlansy_work.c + lapacke_zlantr.c + lapacke_zlantr_work.c + lapacke_zlapmr.c + lapacke_zlapmr_work.c + lapacke_zlarfb.c + lapacke_zlarfb_work.c + lapacke_zlarfg.c + lapacke_zlarfg_work.c + lapacke_zlarft.c + lapacke_zlarft_work.c + lapacke_zlarfx.c + lapacke_zlarfx_work.c + lapacke_zlarnv.c + lapacke_zlarnv_work.c + lapacke_zlaset.c + lapacke_zlaset_work.c + lapacke_zlaswp.c + lapacke_zlaswp_work.c + lapacke_zlauum.c + lapacke_zlauum_work.c + lapacke_zpbcon.c + 
lapacke_zpbcon_work.c + lapacke_zpbequ.c + lapacke_zpbequ_work.c + lapacke_zpbrfs.c + lapacke_zpbrfs_work.c + lapacke_zpbstf.c + lapacke_zpbstf_work.c + lapacke_zpbsv.c + lapacke_zpbsv_work.c + lapacke_zpbsvx.c + lapacke_zpbsvx_work.c + lapacke_zpbtrf.c + lapacke_zpbtrf_work.c + lapacke_zpbtrs.c + lapacke_zpbtrs_work.c + lapacke_zpftrf.c + lapacke_zpftrf_work.c + lapacke_zpftri.c + lapacke_zpftri_work.c + lapacke_zpftrs.c + lapacke_zpftrs_work.c + lapacke_zpocon.c + lapacke_zpocon_work.c + lapacke_zpoequ.c + lapacke_zpoequ_work.c + lapacke_zpoequb.c + lapacke_zpoequb_work.c + lapacke_zporfs.c + lapacke_zporfs_work.c + lapacke_zposv.c + lapacke_zposv_work.c + lapacke_zposvx.c + lapacke_zposvx_work.c + lapacke_zpotrf.c + lapacke_zpotrf_work.c + lapacke_zpotri.c + lapacke_zpotri_work.c + lapacke_zpotrs.c + lapacke_zpotrs_work.c + lapacke_zppcon.c + lapacke_zppcon_work.c + lapacke_zppequ.c + lapacke_zppequ_work.c + lapacke_zpprfs.c + lapacke_zpprfs_work.c + lapacke_zppsv.c + lapacke_zppsv_work.c + lapacke_zppsvx.c + lapacke_zppsvx_work.c + lapacke_zpptrf.c + lapacke_zpptrf_work.c + lapacke_zpptri.c + lapacke_zpptri_work.c + lapacke_zpptrs.c + lapacke_zpptrs_work.c + lapacke_zpstrf.c + lapacke_zpstrf_work.c + lapacke_zptcon.c + lapacke_zptcon_work.c + lapacke_zpteqr.c + lapacke_zpteqr_work.c + lapacke_zptrfs.c + lapacke_zptrfs_work.c + lapacke_zptsv.c + lapacke_zptsv_work.c + lapacke_zptsvx.c + lapacke_zptsvx_work.c + lapacke_zpttrf.c + lapacke_zpttrf_work.c + lapacke_zpttrs.c + lapacke_zpttrs_work.c + lapacke_zspcon.c + lapacke_zspcon_work.c + lapacke_zsprfs.c + lapacke_zsprfs_work.c + lapacke_zspsv.c + lapacke_zspsv_work.c + lapacke_zspsvx.c + lapacke_zspsvx_work.c + lapacke_zsptrf.c + lapacke_zsptrf_work.c + lapacke_zsptri.c + lapacke_zsptri_work.c + lapacke_zsptrs.c + lapacke_zsptrs_work.c + lapacke_zstedc.c + lapacke_zstedc_work.c + lapacke_zstegr.c + lapacke_zstegr_work.c + lapacke_zstein.c + lapacke_zstein_work.c + lapacke_zstemr.c + lapacke_zstemr_work.c + 
lapacke_zsteqr.c + lapacke_zsteqr_work.c + lapacke_zsycon.c + lapacke_zsycon_work.c + lapacke_zsyconv.c + lapacke_zsyconv_work.c + lapacke_zsyequb.c + lapacke_zsyequb_work.c + lapacke_zsyrfs.c + lapacke_zsyrfs_work.c + lapacke_zsysv.c + lapacke_zsysv_rook.c + lapacke_zsysv_rook_work.c + lapacke_zsysv_work.c + lapacke_zsysvx.c + lapacke_zsysvx_work.c + lapacke_zsyswapr.c + lapacke_zsyswapr_work.c + lapacke_zsytrf.c + lapacke_zsytrf_work.c + lapacke_zsytri.c + lapacke_zsytri2.c + lapacke_zsytri2_work.c + lapacke_zsytri2x.c + lapacke_zsytri2x_work.c + lapacke_zsytri_work.c + lapacke_zsytrs.c + lapacke_zsytrs2.c + lapacke_zsytrs2_work.c + lapacke_zsytrs_work.c + lapacke_ztbcon.c + lapacke_ztbcon_work.c + lapacke_ztbrfs.c + lapacke_ztbrfs_work.c + lapacke_ztbtrs.c + lapacke_ztbtrs_work.c + lapacke_ztfsm.c + lapacke_ztfsm_work.c + lapacke_ztftri.c + lapacke_ztftri_work.c + lapacke_ztfttp.c + lapacke_ztfttp_work.c + lapacke_ztfttr.c + lapacke_ztfttr_work.c + lapacke_ztgevc.c + lapacke_ztgevc_work.c + lapacke_ztgexc.c + lapacke_ztgexc_work.c + lapacke_ztgsen.c + lapacke_ztgsen_work.c + lapacke_ztgsja.c + lapacke_ztgsja_work.c + lapacke_ztgsna.c + lapacke_ztgsna_work.c + lapacke_ztgsyl.c + lapacke_ztgsyl_work.c + lapacke_ztpcon.c + lapacke_ztpcon_work.c + lapacke_ztpmqrt.c + lapacke_ztpmqrt_work.c + lapacke_ztpqrt.c + lapacke_ztpqrt2.c + lapacke_ztpqrt2_work.c + lapacke_ztpqrt_work.c + lapacke_ztprfb.c + lapacke_ztprfb_work.c + lapacke_ztprfs.c + lapacke_ztprfs_work.c + lapacke_ztptri.c + lapacke_ztptri_work.c + lapacke_ztptrs.c + lapacke_ztptrs_work.c + lapacke_ztpttf.c + lapacke_ztpttf_work.c + lapacke_ztpttr.c + lapacke_ztpttr_work.c + lapacke_ztrcon.c + lapacke_ztrcon_work.c + lapacke_ztrevc.c + lapacke_ztrevc_work.c + lapacke_ztrexc.c + lapacke_ztrexc_work.c + lapacke_ztrrfs.c + lapacke_ztrrfs_work.c + lapacke_ztrsen.c + lapacke_ztrsen_work.c + lapacke_ztrsna.c + lapacke_ztrsna_work.c + lapacke_ztrsyl.c + lapacke_ztrsyl_work.c + lapacke_ztrtri.c + lapacke_ztrtri_work.c 
+ lapacke_ztrtrs.c + lapacke_ztrtrs_work.c + lapacke_ztrttf.c + lapacke_ztrttf_work.c + lapacke_ztrttp.c + lapacke_ztrttp_work.c + lapacke_ztzrzf.c + lapacke_ztzrzf_work.c + lapacke_zunbdb.c + lapacke_zunbdb_work.c + lapacke_zuncsd.c + lapacke_zuncsd_work.c + lapacke_zungbr.c + lapacke_zungbr_work.c + lapacke_zunghr.c + lapacke_zunghr_work.c + lapacke_zunglq.c + lapacke_zunglq_work.c + lapacke_zungql.c + lapacke_zungql_work.c + lapacke_zungqr.c + lapacke_zungqr_work.c + lapacke_zungrq.c + lapacke_zungrq_work.c + lapacke_zungtr.c + lapacke_zungtr_work.c + lapacke_zunmbr.c + lapacke_zunmbr_work.c + lapacke_zunmhr.c + lapacke_zunmhr_work.c + lapacke_zunmlq.c + lapacke_zunmlq_work.c + lapacke_zunmql.c + lapacke_zunmql_work.c + lapacke_zunmqr.c + lapacke_zunmqr_work.c + lapacke_zunmrq.c + lapacke_zunmrq_work.c + lapacke_zunmrz.c + lapacke_zunmrz_work.c + lapacke_zunmtr.c + lapacke_zunmtr_work.c + lapacke_zupgtr.c + lapacke_zupgtr_work.c + lapacke_zupmtr.c + lapacke_zupmtr_work.c + lapacke_zsyr.c + lapacke_csyr.c + lapacke_zsyr_work.c + lapacke_csyr_work.c + lapacke_ilaver.c +) + +set(SRCX + lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c + lapacke_cgbrfsx_work.c lapacke_cporfsx_work.c lapacke_dgerfsx_work.c lapacke_sgbrfsx_work.c lapacke_ssyrfsx_work.c lapacke_zherfsx_work.c + lapacke_cgerfsx.c lapacke_csyrfsx.c lapacke_dporfsx.c lapacke_sgerfsx.c lapacke_zgbrfsx.c lapacke_zporfsx.c + lapacke_cgerfsx_work.c lapacke_csyrfsx_work.c lapacke_dporfsx_work.c lapacke_sgerfsx_work.c lapacke_zgbrfsx_work.c lapacke_zporfsx_work.c + lapacke_cherfsx.c lapacke_dgbrfsx.c lapacke_dsyrfsx.c lapacke_sporfsx.c lapacke_zgerfsx.c lapacke_zsyrfsx.c + lapacke_cherfsx_work.c lapacke_dgbrfsx_work.c lapacke_dsyrfsx_work.c lapacke_sporfsx_work.c lapacke_zgerfsx_work.c lapacke_zsyrfsx_work.c + lapacke_cgbsvxx.c lapacke_cposvxx.c lapacke_dgesvxx.c lapacke_sgbsvxx.c lapacke_ssysvxx.c lapacke_zhesvxx.c + lapacke_cgbsvxx_work.c 
lapacke_cposvxx_work.c lapacke_dgesvxx_work.c lapacke_sgbsvxx_work.c lapacke_ssysvxx_work.c lapacke_zhesvxx_work.c + lapacke_cgesvxx.c lapacke_csysvxx.c lapacke_dposvxx.c lapacke_sgesvxx.c lapacke_zgbsvxx.c lapacke_zposvxx.c + lapacke_cgesvxx_work.c lapacke_csysvxx_work.c lapacke_dposvxx_work.c lapacke_sgesvxx_work.c lapacke_zgbsvxx_work.c lapacke_zposvxx_work.c + lapacke_chesvxx.c lapacke_dgbsvxx.c lapacke_dsysvxx.c lapacke_sposvxx.c lapacke_zgesvxx.c lapacke_zsysvxx.c + lapacke_chesvxx_work.c lapacke_dgbsvxx_work.c lapacke_dsysvxx_work.c lapacke_sposvxx_work.c lapacke_zgesvxx_work.c lapacke_zsysvxx_work.c +) + + +# FILE PARTS OF TMGLIB +set(MATGEN + lapacke_clatms.c + lapacke_clatms_work.c + lapacke_dlatms.c + lapacke_dlatms_work.c + lapacke_slatms.c + lapacke_slatms_work.c + lapacke_zlatms.c + lapacke_zlatms_work.c + lapacke_clagge.c + lapacke_clagge_work.c + lapacke_dlagge.c + lapacke_dlagge_work.c + lapacke_slagge.c + lapacke_slagge_work.c + lapacke_zlagge.c + lapacke_zlagge_work.c + lapacke_claghe.c + lapacke_claghe_work.c + lapacke_zlaghe.c + lapacke_zlaghe_work.c + lapacke_clagsy.c + lapacke_clagsy_work.c + lapacke_dlagsy.c + lapacke_dlagsy_work.c + lapacke_slagsy.c + lapacke_slagsy_work.c + lapacke_zlagsy.c + lapacke_zlagsy_work.c +) + +set(LAPACKE_REL_SRC "") +if (BUILD_SINGLE) + list(APPEND LAPACKE_REL_SRC ${SSRC}) +endif () + +if (BUILD_DOUBLE) + list(APPEND LAPACKE_REL_SRC ${DSRC}) +endif () + +if (BUILD_COMPLEX) + list(APPEND LAPACKE_REL_SRC ${CSRC}) +endif () + +if (BUILD_COMPLEX16) + list(APPEND LAPACKE_REL_SRC ${ZSRC}) +endif () + +# add lapack-netlib folder to the sources +set(LAPACKE_SOURCES "") +foreach (LAE_FILE ${LAPACKE_REL_SRC}) + list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/SRC/${LAE_FILE}") +endforeach () + +set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include") +execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h") 
+include_directories(${lapacke_include_dir}) +set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") From 87336b9acf1216927b911f19e8417035f420f69c Mon Sep 17 00:00:00 2001 From: xantares Date: Thu, 6 Aug 2015 20:03:50 +0200 Subject: [PATCH 210/257] install OpenBLASConfigVersion.cmake --- Makefile.install | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Makefile.install b/Makefile.install index a5814e55a..9814302b0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -11,6 +11,7 @@ OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake +OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake .PHONY : install .NOTPARALLEL : install @@ -97,6 +98,7 @@ endif @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) @@ -112,5 +114,16 @@ else #only static @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) endif +#Generating OpenBLASConfigVersion.cmake + @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "else ()" >> 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo Install OK! From f874465bb81d10e7cdb88a10cff7d62df3fe370c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Aug 2015 14:10:44 -0500 Subject: [PATCH 211/257] Use cmake to build OpenBLAS GENERIC Target on MSVC x86 64-bit. Disable CBLAS and LAPACK. --- CMakeLists.txt | 20 ++++++++--- cmake/export.cmake | 60 +++++++++++++++++++++++++++++++ cmake/f_check.cmake | 3 ++ cmake/kernel.cmake | 15 +++++--- cmake/prebuild.cmake | 9 +++-- cmake/system.cmake | 15 ++++++++ cmake/utils.cmake | 4 +++ common.h | 45 ++++++++++++++++++----- common_x86_64.h | 35 +++++++++++++++--- driver/level2/CMakeLists.txt | 59 ++++++++++++++++++++++++++++++ driver/level2/gbmv_thread.c | 2 +- driver/level2/sbmv_thread.c | 2 +- driver/level2/spmv_thread.c | 2 +- driver/level2/tbmv_thread.c | 2 +- driver/level2/tpmv_thread.c | 2 +- driver/level2/trmv_thread.c | 2 +- driver/level2/zgbmv_k.c | 2 +- driver/level2/zhbmv_k.c | 10 +++--- driver/level2/zhpmv_k.c | 10 +++--- driver/level2/zsbmv_k.c | 6 ++-- driver/level2/zspmv_k.c | 3 +- driver/level2/ztbmv_L.c | 2 +- driver/level2/ztbmv_U.c | 2 +- driver/level2/ztbsv_L.c | 2 +- driver/level2/ztbsv_U.c | 2 +- driver/level2/ztpmv_L.c | 2 +- driver/level2/ztpmv_U.c | 2 +- driver/level2/ztpsv_L.c | 2 +- driver/level2/ztpsv_U.c | 2 +- driver/level2/ztrmv_L.c | 2 +- driver/level2/ztrmv_U.c | 2 +- driver/level2/ztrsv_L.c | 2 +- driver/level2/ztrsv_U.c | 2 +- 
driver/level3/CMakeLists.txt | 37 ++++++++++++++----- driver/others/CMakeLists.txt | 2 ++ interface/CMakeLists.txt | 39 ++++++++++++++++++-- interface/rotg.c | 3 +- interface/zaxpby.c | 4 +-- interface/zdot.c | 24 +++++++------ interface/zgemv.c | 17 +++++---- interface/zrotg.c | 20 +++++++---- kernel/CMakeLists.txt | 70 ++++++++++++++++++++++++++++++------ kernel/Makefile.L3 | 2 +- kernel/arm/zaxpby.c | 7 ++-- kernel/arm/zaxpy.c | 6 ++-- kernel/arm/zcopy.c | 6 ++-- kernel/arm/zdot.c | 18 +++++----- kernel/arm/zrot.c | 6 ++-- kernel/arm/zswap.c | 6 ++-- kernel/x86_64/KERNEL.generic | 6 ++++ openblas_config_template.h | 3 +- 51 files changed, 488 insertions(+), 120 deletions(-) create mode 100644 cmake/export.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d2e5d3c6..610cc9c90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,11 +15,13 @@ enable_language(C) set(OpenBLAS_LIBNAME openblas) ####### -option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS and CBLAS)" ON) +option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) +option(BUILD_WITHOUT_CBLAS "Without CBLAS" ON) option(BUILD_DEBUG "Build Debug Version" OFF) ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) +set(NO_LAPACKE 1) endif() if(BUILD_DEBUG) @@ -27,6 +29,11 @@ set(CMAKE_BUILD_TYPE Debug) else() set(CMAKE_BUILD_TYPE Release) endif() + +if(BUILD_WITHOUT_CBLAS) +set(NO_CBLAS 1) +endif() + ####### @@ -51,7 +58,6 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) - message ("error 1") list(APPEND SUBDIRS lapack) endif () @@ -111,15 +117,21 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. 
if (NOT NOFORTRAN AND NOT NO_LAPACK) - message ("error 2") include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") if (NOT NO_LAPACKE) include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") endif () endif () +#Only generate .def for dll on MSVC +if(MSVC) +set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") +endif() + # add objects to the openblas lib -add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) +add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${PROJECT_BINARY_DIR}/openblas.def) + +include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") #only build shared library for MSVC if(NOT MSVC) diff --git a/cmake/export.cmake b/cmake/export.cmake new file mode 100644 index 000000000..adf59101f --- /dev/null +++ b/cmake/export.cmake @@ -0,0 +1,60 @@ + +#Only generate .def for dll on MSVC +if(MSVC) + +set_source_files_properties(${OpenBLAS_DEF_FILE} PROPERTIES GENERATED 1) + +if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") +else() + set(ARCH_IN ${ARCH}) +endif() + +if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") +endif () + +if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) +else() + set(EXPRECISION_IN ${EXPRECISION}) +endif() + +if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) +else() + set(NO_CBLAS_IN ${NO_CBLAS}) +endif() + +if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) +else() + set(NO_LAPACK_IN ${NO_LAPACK}) +endif() + +if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) +else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) +endif() + +if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) +else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) +endif() + +if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) +else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) +endif() + +add_custom_command( + TARGET ${OpenBLAS_LIBNAME} PRE_LINK + COMMAND perl + ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" 
"${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" + COMMENT "Create openblas.def file" + VERBATIM) + +endif() \ No newline at end of file diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index f7651db56..e189b683a 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -25,7 +25,10 @@ if (MSVC) include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif () + +if (NOT NO_LAPACK) enable_language(Fortran) +endif() if (NOT ONLY_CBLAS) # N.B. f_check is not cross-platform, so instead try to use CMake variables diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 3a4d13837..c2ee62545 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -99,10 +99,10 @@ macro(SetDefaultL1) set(QGEMVTKERNEL gemv_t.S) set(XGEMVNKERNEL zgemv_n.S) set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL cabs.S) - set(DCABS_KERNEL cabs.S) - set(QCABS_KERNEL cabs.S) - set(LSAME_KERNEL lsame.S) + set(SCABS_KERNEL ../generic/cabs.c) + set(DCABS_KERNEL ../generic/cabs.S) + set(QCABS_KERNEL ../generic/cabs.S) + set(LSAME_KERNEL ../generic/lsame.c) set(SAXPBYKERNEL ../arm/axpby.c) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) @@ -156,3 +156,10 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) endmacro () + +macro(SetDefaultL3) + set(SGEADD_KERNEL ../generic/geadd.c) + set(DGEADD_KERNEL ../generic/geadd.c) + set(CGEADD_KERNEL ../generic/zgeadd.c) + set(ZGEADD_KERNEL ../generic/zgeadd.c) +endmacro () \ No newline at end of file diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 901c237c4..c3fa48655 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -66,6 +66,11 @@ if (NOT MSVC) list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) endif () +if (MSVC) +#Use generic for MSVC now +set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) +endif() + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") 
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) @@ -73,7 +78,7 @@ try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${GETARCH_BIN} + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) message(STATUS "Running getarch") @@ -95,7 +100,7 @@ try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${GETARCH2_BIN} + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 36f9b7cbd..8ec738a10 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -420,6 +420,21 @@ if (ONLY_CBLAS) set(LIB_COMPONENTS CBLAS) endif () + +# For GEMM3M +set(USE_GEMM3M 0) + +if (DEFINED ARCH) + if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") + set(USE_GEMM3M 1) + endif () + + if (${CORE} STREQUAL "generic") + set(USE_GEMM3M 0) + endif () +endif () + + #export OSNAME #export ARCH #export CORE diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 498c3840a..6e2a98069 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -102,6 +102,7 @@ endfunction () # 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) # 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) # 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c) +# 4 - compiles for complex types only, but changes source names for complex by prepending z (e.g. hemv.c becomes zhemv.c) # STRING - compiles only the given type (e.g. 
DOUBLE) function(GenerateNamedObjects sources_in) @@ -151,6 +152,9 @@ function(GenerateNamedObjects sources_in) set(complex_only true) elseif (${ARGV7} EQUAL 3) set(mangle_complex_sources true) + elseif (${ARGV7} EQUAL 4) + set(mangle_complex_sources true) + set(complex_only true) elseif (NOT ${ARGV7} EQUAL 0) set(float_list ${ARGV7}) endif () diff --git a/common.h b/common.h index 1894a5c86..1fb2c7eaf 100644 --- a/common.h +++ b/common.h @@ -296,13 +296,6 @@ typedef int blasint; #define COMPSIZE 2 #endif -#if defined(C_PGI) || defined(C_SUN) -#define CREAL(X) (*((FLOAT *)&X + 0)) -#define CIMAG(X) (*((FLOAT *)&X + 1)) -#else -#define CREAL __real__ -#define CIMAG __imag__ -#endif #define Address_H(x) (((x)+(1<<15))>>16) #define Address_L(x) ((x)-((Address_H(x))<<16)) @@ -464,17 +457,49 @@ typedef char* env_var_t; extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) + (__GNUC__ >= 3 && !defined(__cplusplus)) || \ + _MSC_VER >= 1800) // Visual Studio 2013 supports complex #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble _Complex openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; typedef struct { xdouble real, imag; } openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define 
openblas_make_complex_xdouble(real, imag) {(real), (imag)} #endif + +#ifdef XDOUBLE +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) +#elif defined(DOUBLE) +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_double +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i) +#else +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_float +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i) +#endif + +#if defined(C_PGI) || defined(C_SUN) +#define CREAL(X) (*((FLOAT *)&X + 0)) +#define CIMAG(X) (*((FLOAT *)&X + 1)) +#else +#ifdef OPENBLAS_COMPLEX_STRUCT +#define CREAL(Z) ((Z).real) +#define CIMAG(Z) ((Z).imag) +#else +#define CREAL __real__ +#define CIMAG __imag__ +#endif +#endif + #endif // ASSEMBLER #ifndef IFLUSH @@ -491,6 +516,10 @@ typedef char* env_var_t; #endif #endif +#if defined(C_MSVC) +#define inline __inline +#endif + #ifndef ASSEMBLER #ifndef MIN diff --git a/common_x86_64.h b/common_x86_64.h index efb902416..8bb87c7c0 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -41,6 +41,10 @@ #ifndef ASSEMBLER +#ifdef C_MSVC +#include +#endif + #ifdef C_SUN #define __asm__ __asm #define __volatile__ @@ -61,30 +65,39 @@ static void __inline blas_lock(volatile BLASULONG *address){ - int ret; + BLASULONG ret; do { while (*address) {YIELDING;}; +#ifndef C_MSVC __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); - +#else + ret=InterlockedExchange64((volatile LONG64 *)(address), 1); +#endif } while (ret); + } static __inline BLASULONG rpcc(void){ +#ifdef C_MSVC + return __rdtsc(); +#else BLASULONG a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((BLASULONG)a + ((BLASULONG)d << 32)); +#endif } #define RPCC64BIT +#ifndef C_MSVC static __inline BLASULONG getstackaddr(void){ BLASULONG addr; @@ -93,22 +106,32 @@ static __inline BLASULONG getstackaddr(void){ return addr; } +#endif static 
__inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ +#ifdef C_MSVC + int cpuinfo[4]; + __cpuid(cpuinfo, op); + *eax=cpuinfo[0]; + *ebx=cpuinfo[1]; + *ecx=cpuinfo[2]; + *edx=cpuinfo[3]; +#else __asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op)); +#endif } /* #define WHEREAMI */ -static inline int WhereAmI(void){ +static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; @@ -150,10 +173,14 @@ static inline int WhereAmI(void){ #define GET_IMAGE_CANCEL #ifdef SMP -#ifdef USE64BITINT +#if defined(USE64BITINT) static __inline blasint blas_quickdivide(blasint x, blasint y){ return x / y; } +#elif defined (C_MSVC) +static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ + return x / y; +} #else extern unsigned int blas_quick_divide_table[]; diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index e4440be6d..5db4fb5ee 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -46,12 +46,28 @@ set(NU_SMP_SOURCES tbmv_thread.c ) +set(ULVM_COMPLEX_SOURCES + hbmv_k.c + hpmv_k.c + hpr_k.c + hpr2_k.c + her_k.c + her2_k.c +) + # objects that need LOWER set GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) # gbmv uses a lowercase n and t GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) +# c/zgbmv +GenerateNamedObjects("zgbmv_k.c" "CONJ" "gbmv_r" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ" "gbmv_c" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "XCONJ" "gbmv_o" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;XCONJ" "gbmv_u" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "CONJ;XCONJ" "gbmv_s" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ;XCONJ" "gbmv_d" false "" "" "" 2) # special defines for complex foreach (float_type ${FLOAT_TYPES}) @@ -82,6 +98,14 @@ foreach (float_type ${FLOAT_TYPES}) 
GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CU" false ${float_type}) endforeach () + foreach (ulvm_source ${ULVM_COMPLEX_SOURCES}) + string(REGEX MATCH "[a-z0-9]+" op_name ${ulvm_source}) + GenerateNamedObjects("z${ulvm_source}" "" "${op_name}_U" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "LOWER" "${op_name}_L" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "HEMVREV" "${op_name}_V" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "LOWER;HEMVREV" "${op_name}_M" false "" "" false ${float_type}) + endforeach() + if (SMP) GenerateNamedObjects("gemv_thread.c" "CONJ" "gemv_thread_r" false "" "" false ${float_type}) @@ -103,6 +127,41 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "HEMV" "hbmv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "HEMV;LOWER" "hbmv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "HEMVREV" "hbmv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "LOWER;HEMVREV" "hbmv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spmv_thread.c" "HEMV" "hpmv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "HEMV;LOWER" "hpmv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "HEMVREV" "hpmv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "LOWER;HEMVREV" "hpmv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spr_thread.c" "HEMV" "hpr_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("spr_thread.c" "HEMV;LOWER" "hpr_thread_L" false "" "" false ${float_type}) + 
GenerateNamedObjects("spr_thread.c" "HEMVREV" "hpr_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spr_thread.c" "LOWER;HEMVREV" "hpr_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spr2_thread.c" "HEMV" "hpr2_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("spr2_thread.c" "HEMV;LOWER" "hpr2_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("spr2_thread.c" "HEMVREV" "hpr2_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spr2_thread.c" "LOWER;HEMVREV" "hpr2_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("symv_thread.c" "HEMV" "hemv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "HEMV;LOWER" "hemv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "HEMVREV" "hemv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "LOWER;HEMVREV" "hemv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("syr_thread.c" "HER" "her_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "HER;LOWER" "her_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "HEMVREV" "her_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "LOWER;HEMVREV" "her_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("syr2_thread.c" "HER2" "her2_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HER2;LOWER" "her2_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HEMVREV" "her2_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "LOWER;HEMVREV" "her2_thread_M" false "" "" false ${float_type}) + foreach (nu_smp_src ${NU_SMP_SOURCES}) string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_src}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=1" 0 "${op_name}_N" false ${float_type}) 
diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index 9efe17092..ef9d58d76 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -64,7 +64,7 @@ static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 5b7fc7332..a0377d638 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -60,7 +60,7 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 93a2f44d4..0f47344df 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -60,7 +60,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index 3c1249448..bbb1c50eb 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -76,7 +76,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index 3b91cee45..47dc1daf9 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -81,7 +81,7 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/trmv_thread.c 
b/driver/level2/trmv_thread.c index 29e9799f6..a9dc2dc62 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -87,7 +87,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c index 68d6045bd..d89932e33 100644 --- a/driver/level2/zgbmv_k.c +++ b/driver/level2/zgbmv_k.c @@ -77,7 +77,7 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOA FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; #ifdef TRANS - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif if (incy != 1) { diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c index 70e92e050..33f70d2c5 100644 --- a/driver/level2/zhbmv_k.c +++ b/driver/level2/zhbmv_k.c @@ -56,6 +56,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferX = sbmvbuffer; FLOAT temp[2]; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); @@ -93,7 +95,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -118,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * 
CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -143,7 +145,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -168,7 +170,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c index 96bceaaf2..9e7ed7b0e 100644 --- a/driver/level2/zhpmv_k.c +++ b/driver/level2/zhpmv_k.c @@ -51,6 +51,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferX = gemvbuffer; FLOAT temp[2]; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #ifndef HEMVREV #ifndef LOWER if (i > 0) { - FLOAT _Complex result = DOTC_K(i, a, 1, X, 1); + result = DOTC_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -93,7 +95,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else if (m - i > 1) { - FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + result = DOTC_K(m - i - 1, a + (i + 1) * 
2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -118,7 +120,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else #ifndef LOWER if (i > 0) { - FLOAT _Complex result = DOTU_K(i, a, 1, X, 1); + result = DOTU_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -142,7 +144,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else if (m - i > 1) { - FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c index 30e2f91c3..3ae74ce80 100644 --- a/driver/level2/zsbmv_k.c +++ b/driver/level2/zsbmv_k.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferY = sbmvbuffer; FLOAT *bufferX = sbmvbuffer; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); @@ -83,7 +85,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -100,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, a, 1, Y + i * COMPSIZE, 1, NULL, 0); if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 
1, X + (i + 1) * COMPSIZE, 1); + result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c index 76657eab9..432205e83 100644 --- a/driver/level2/zspmv_k.c +++ b/driver/level2/zspmv_k.c @@ -49,7 +49,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; - FLOAT _Complex result; + + OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c index 74ff0bce1..1ac1cdef1 100644 --- a/driver/level2/ztbmv_L.c +++ b/driver/level2/ztbmv_L.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c index 933275de3..9aa203396 100644 --- a/driver/level2/ztbmv_U.c +++ b/driver/level2/ztbmv_U.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c index 0726bbd16..9aa701841 100644 --- a/driver/level2/ztbsv_L.c +++ b/driver/level2/ztbsv_L.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztbsv_U.c 
b/driver/level2/ztbsv_U.c index d022650bc..3722b1f71 100644 --- a/driver/level2/ztbsv_U.c +++ b/driver/level2/ztbsv_U.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c index 12c254c12..47e6df56c 100644 --- a/driver/level2/ztpmv_L.c +++ b/driver/level2/ztpmv_L.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c index 59708b8b8..da911fb4e 100644 --- a/driver/level2/ztpmv_U.c +++ b/driver/level2/ztpmv_U.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c index 3b8e562ce..a497e42a4 100644 --- a/driver/level2/ztpsv_L.c +++ b/driver/level2/ztpsv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c index 601ac2f9d..28b824e3a 100644 --- a/driver/level2/ztpsv_U.c +++ b/driver/level2/ztpsv_U.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, 
br, bi, ratio, den; diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c index 63522cf81..92c86aec2 100644 --- a/driver/level2/ztrmv_L.c +++ b/driver/level2/ztrmv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c index 8a4494fd7..f9671c9d6 100644 --- a/driver/level2/ztrmv_U.c +++ b/driver/level2/ztrmv_U.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c index 90f1c2c7d..dd3b2786e 100644 --- a/driver/level2/ztrsv_L.c +++ b/driver/level2/ztrsv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c index bec8114f3..8803182a8 100644 --- a/driver/level2/ztrsv_U.c +++ b/driver/level2/ztrsv_U.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 376a0beeb..6d623b0c2 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -1,13 +1,5 @@ include_directories(${CMAKE_SOURCE_DIR}) -set(USE_GEMM3M 0) - -if (DEFINED ARCH) 
- if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") - set(USE_GEMM3M 1) - endif () -endif () - # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa # loop through gemm.c defines @@ -54,12 +46,41 @@ foreach (float_type ${FLOAT_TYPES}) GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_LC" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_RC" false ${float_type}) + + #hemm + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN" 0 "hemm_L" false ${float_type}) + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE" 0 "hemm_R" false ${float_type}) + + #her2k + GenerateCombinationObjects("zher2k_kernel.c" "LOWER;CONJ" "U;N" "" 2 "her2k_kernel" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) + + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + #hemm + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type}) + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type}) + #her2k + GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) + 
GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) + endif() + # special gemm defines for complex foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) GenerateNamedObjects("gemm.c" "${gemm_define}" "gemm_${gemm_define_LC}" false "" "" false ${float_type}) + if(USE_GEMM3M) + GenerateNamedObjects("gemm3m.c" "${gemm_define}" "gemm3m_${gemm_define_LC}" false "" "" false ${float_type}) + endif() if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false ${float_type}) + if(USE_GEMM3M) + GenerateNamedObjects("gemm3m.c" "${gemm_define};THREADED_LEVEL3" "gemm3m_thread_${gemm_define_LC}" false "" "" false ${float_type}) + endif() endif () endforeach () endif () diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 938f1daaf..b2af55e36 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -33,6 +33,8 @@ set(COMMON_SOURCES xerbla.c openblas_set_num_threads.c openblas_error_handle.c + openblas_get_num_procs.c + openblas_get_num_threads.c ) # these need to have NAME/CNAME set, so use GenerateNamedObjects, but don't use standard name mangling diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ae949235b..91565d2f2 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,13 +1,16 @@ include_directories(${CMAKE_SOURCE_DIR}) + set(BLAS1_SOURCES copy.c - asum.c nrm2.c + nrm2.c ) set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. 
these do not have complex counterparts + rot.c + asum.c ) # these will have 'z' prepended for the complex version @@ -15,7 +18,7 @@ set(BLAS1_MANGLED_SOURCES axpy.c swap.c scal.c dot.c - rot.c rotg.c + rotg.c axpby.c ) @@ -31,6 +34,13 @@ set(BLAS2_SOURCES tpsv.c tpmv.c ) +set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES + hemv.c hbmv.c + her.c her2.c + hpmv.c hpr.c + hpr2.c +) + # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c @@ -39,6 +49,7 @@ set(BLAS3_SOURCES set(BLAS3_MANGLED_SOURCES omatcopy.c imatcopy.c + geadd.c ) # generate the BLAS objs once with and once without cblas @@ -65,9 +76,14 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + #sdsdot, dsdot + GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") + GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") + # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) @@ -86,17 +102,36 @@ endforeach () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zger.c" "" "geru" false "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" false "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "CONJ" "dotc" false "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "" "dotu" false "" "" false ${float_type}) + + 
GenerateNamedObjects("symm.c" "HEMM" "hemm" false "" "" false ${float_type}) + GenerateNamedObjects("syrk.c" "HEMM" "herk" false "" "" false ${float_type}) + GenerateNamedObjects("syr2k.c" "HEMM" "her2k" false "" "" false ${float_type}) + + if (USE_GEMM3M) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + endif() endif () if (${float_type} STREQUAL "COMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "sscal" false "" "" false "COMPLEX") GenerateNamedObjects("nrm2.c" "" "scnrm2" false "" "" true "COMPLEX") + GenerateNamedObjects("zrot.c" "" "csrot" false "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" false "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "scamax" false "" "" true "COMPLEX") + GenerateNamedObjects("asum.c" "" "scasum" false "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" false "" "" false "ZCOMPLEX") GenerateNamedObjects("nrm2.c" "" "dznrm2" false "" "" true "ZCOMPLEX") + GenerateNamedObjects("zrot.c" "" "zdrot" false "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" false "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "dzamax" false "" "" true "ZCOMPLEX") + GenerateNamedObjects("asum.c" "" "dzasum" false "" "" true "ZCOMPLEX") endif () endforeach () diff --git a/interface/rotg.c b/interface/rotg.c index 49088ab02..a0e6efdab 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -14,8 +14,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif - -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 9e8324432..1abb24de9 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -53,13 +53,13 @@ void CNAME(blasint n, FLOAT *ALPHA, 
FLOAT *x, blasint incx, FLOAT *BETA, FLOAT * #endif - if (n <= 0) return; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + if (n <= 0) return; + FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; diff --git a/interface/zdot.c b/interface/zdot.c index 1380ce292..34dfb731a 100644 --- a/interface/zdot.c +++ b/interface/zdot.c @@ -57,21 +57,25 @@ #ifdef RETURN_BY_STRUCT MYTYPE NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #elif defined RETURN_BY_STACK -void NAME(FLOAT _Complex *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +void NAME(OPENBLAS_COMPLEX_FLOAT *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #else -FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +OPENBLAS_COMPLEX_FLOAT NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #endif BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; #ifndef RETURN_BY_STACK - FLOAT _Complex ret; + OPENBLAS_COMPLEX_FLOAT ret; #endif #ifdef RETURN_BY_STRUCT MYTYPE myret; #endif +#ifndef RETURN_BY_STRUCT + OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); +#endif + PRINT_DEBUG_NAME; if (n <= 0) { @@ -80,10 +84,10 @@ FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, myret.i = 0.; return myret; #elif defined RETURN_BY_STACK - *result = ZERO; + *result = zero; return; #else - return ZERO; + return zero; #endif } @@ -144,21 +148,21 @@ FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, #else #ifdef FORCE_USE_STACK -void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT _Complex *result){ +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, OPENBLAS_COMPLEX_FLOAT *result){ #else -FLOAT _Complex CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ +OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ 
- FLOAT _Complex ret; + OPENBLAS_COMPLEX_FLOAT ret; #endif PRINT_DEBUG_CNAME; if (n <= 0) { #ifdef FORCE_USE_STACK - *result = ZERO; + *result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); return; #else - return ZERO; + return OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); #endif } diff --git a/interface/zgemv.c b/interface/zgemv.c index 704034aaf..792f799e5 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -79,6 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *buffer; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, @@ -91,14 +94,14 @@ void NAME(char *TRANS, blasint *M, blasint *N, blasint lenx, leny; blasint i; - PRINT_DEBUG_NAME; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + PRINT_DEBUG_NAME; + TOUPPER(trans); info = 0; @@ -153,14 +156,14 @@ void CNAME(enum CBLAS_ORDER order, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; - PRINT_DEBUG_CNAME; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + PRINT_DEBUG_CNAME; + trans = -1; info = 0; @@ -234,10 +237,10 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - int nthreads_max = num_cpu_avail(2); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(2); + nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; + MNK = (double) m * (double) n; if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) nthreads_max = 1; diff --git a/interface/zrotg.c b/interface/zrotg.c index e9e8a11df..187343d41 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -6,13 +6,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ - PRINT_DEBUG_NAME; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) +#if defined(__i386__) || defined(__x86_64__) || 
defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); long double da_i = *(DA + 1); @@ -22,6 +16,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double ada = fabs(da_r) + fabs(da_i); + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; @@ -54,6 +54,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ FLOAT ada = fabs(da_r) + fabs(da_i); FLOAT adb; + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index cd71101a5..d2cc77b11 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -17,6 +17,7 @@ endif () SetDefaultL1() SetDefaultL2() +SetDefaultL3() ParseMakefileVars("${KERNELDIR}/KERNEL") ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") @@ -65,8 +66,20 @@ foreach (float_type ${FLOAT_TYPES}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) endif () + + if (${float_type} STREQUAL "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "srot_k" false "" "" false ${float_type}) + endif() + if (${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "drot_k" false "" "" false ${float_type}) + endif() + endforeach () +#dsdot,sdsdot +GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") +GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # Makefile.L2 GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) @@ -86,6 +99,12 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_U_KERNEL}" "HEMV" "hemv_U" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_L_KERNEL}" "HEMV;LOWER" "hemv_L" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_V_KERNEL}" "HEMV;HEMVREV" "hemv_V" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_M_KERNEL}" "HEMV;HEMVREV;LOWER" "hemv_M" false "" "" false ${float_type}) + else () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type}) @@ -93,14 +112,9 @@ foreach (float_type ${FLOAT_TYPES}) endforeach () # Makefile.L3 -set(USE_GEMM3M false) set(USE_TRMM false) -if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") - set(USE_GEMM3M true) -endif () - -if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC") +if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic") set(USE_TRMM true) endif () @@ -155,6 +169,13 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) + + #hemm + GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" 
"hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) + else () GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) endif () @@ -241,11 +262,40 @@ foreach (float_type ${FLOAT_TYPES}) endif () endif () - GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CN}" "" "domatcopy_k_cn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RN}" "ROWM" "domatcopy_k_rn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CT}" "" "domatcopy_k_ct" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RT}" "ROWM" "domatcopy_k_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CN}" "" "omatcopy_k_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RN}" "ROWM" "omatcopy_k_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CT}" "" "omatcopy_k_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RT}" "ROWM" "omatcopy_k_rt" false "" "" false ${float_type}) + + if (NOT DEFINED ${float_char}OMATCOPY_CNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CNC ../arm/zomatcopy_cnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RNC ../arm/zomatcopy_rnc.c) + 
endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_CTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CTC ../arm/zomatcopy_ctc.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RTC ../arm/zomatcopy_rtc.c) + endif () + endif () + + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CNC}" "CONJ" "omatcopy_k_cnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RNC}" "CONJ;ROWM" "omatcopy_k_rnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CTC}" "CONJ" "omatcopy_k_ctc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RTC}" "CONJ;ROWM" "omatcopy_k_rtc" false "" "" false ${float_type}) + endif() + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () # Makefile.LA diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4ef351de3..60b8fb57f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -3459,7 +3459,7 @@ ifndef DGEADD_K DGEADD_K = ../generic/geadd.c endif -$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) +$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef CGEADD_K diff --git a/kernel/arm/zaxpby.c b/kernel/arm/zaxpby.c index 2e0c2940d..d9948349d 100644 --- a/kernel/arm/zaxpby.c +++ b/kernel/arm/zaxpby.c @@ -38,13 +38,16 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG ix,iy; FLOAT temp; + BLASLONG inc_x2; + BLASLONG inc_y2; + if ( n < 0 ) return(0); ix = 0; iy = 0; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; if ( beta_r == 
0.0 && beta_i == 0.0) { diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c index 929ee8b54..1dcaeac27 100644 --- a/kernel/arm/zaxpy.c +++ b/kernel/arm/zaxpy.c @@ -41,6 +41,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { BLASLONG i=0; BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); @@ -48,8 +50,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, ix = 0; iy = 0; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c index f720d6ee5..07fe584c5 100644 --- a/kernel/arm/zcopy.c +++ b/kernel/arm/zcopy.c @@ -40,11 +40,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 198104022..57f47e58e 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -40,24 +40,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else -openblas_complex_double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot[2]; - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; dot[0]=0.0; dot[1]=0.0; - __real__ result = 0.0 ; - __imag__ result = 0.0 ; + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; if ( n < 1 ) return(result); - BLASLONG inc_x2 = 2 * inc_x ; - BLASLONG inc_y2 = 2 * inc_y ; + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; while(i < n) { @@ -73,8 +75,8 @@ openblas_complex_double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BL i++ ; } - __real__ result = dot[0]; - __imag__ result = dot[1]; + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; return(result); } diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c index 356a4df72..98be68db8 100644 --- a/kernel/arm/zrot.c +++ b/kernel/arm/zrot.c @@ -41,11 +41,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n <= 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x ; - BLASLONG inc_y2 = 2 * inc_y ; + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; while(i < n) { diff --git a/kernel/arm/zswap.c b/kernel/arm/zswap.c index fcfb38506..ae4760ae0 100644 --- a/kernel/arm/zswap.c +++ b/kernel/arm/zswap.c @@ -42,11 +42,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 672edb069..a23e59f3f 100644 
--- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -155,5 +155,11 @@ XSYMV_L_KERNEL = ../generic/zsymv_k.c ZHEMV_U_KERNEL = ../generic/zhemv_k.c ZHEMV_L_KERNEL = ../generic/zhemv_k.c +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/openblas_config_template.h b/openblas_config_template.h index 3b3435b0e..942a8f547 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -59,7 +59,8 @@ typedef int blasint; extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) + (__GNUC__ >= 3 && !defined(__cplusplus)) || \ + _MSC_VER >= 1800) // Visual Studio 2013 supports complex #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include From f8eba3d548d48d10a39bdc8cce9ab59dba5cda69 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 11 Aug 2015 16:25:16 -0500 Subject: [PATCH 212/257] Fixed cmake build bugs on Linux. 
--- CMakeLists.txt | 21 ++++++++++++++++++--- cmake/c_check.cmake | 9 +++++++-- cmake/kernel.cmake | 4 ++-- cmake/lapacke.cmake | 2 +- common.h | 3 +++ common_x86_64.h | 4 ++++ ctest/CMakeLists.txt | 6 ++++++ driver/level2/CMakeLists.txt | 12 ++++++------ test/CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 9 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 ctest/CMakeLists.txt create mode 100644 test/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 610cc9c90..e10df13a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,10 @@ enable_language(C) set(OpenBLAS_LIBNAME openblas) ####### +if(MSVC) option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) -option(BUILD_WITHOUT_CBLAS "Without CBLAS" ON) +endif() +option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(BUILD_DEBUG "Build Debug Version" OFF) ####### if(BUILD_WITHOUT_LAPACK) @@ -129,15 +131,28 @@ set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() # add objects to the openblas lib -add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${PROJECT_BINARY_DIR}/openblas.def) +add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") -#only build shared library for MSVC + if(NOT MSVC) +#only build shared library for MSVC add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +if(SMP) +target_link_libraries(${OpenBLAS_LIBNAME} pthread) +target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) +endif() + +#build test and ctest +enable_testing() +add_subdirectory(test) +if(NOT NO_CBLAS) +add_subdirectory(ctest) +endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES 
diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index e32c18a43..89ec31446 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -25,9 +25,14 @@ # PTHREAD_CREATE_FUNC # N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. - -# TODO: detect FU (front underscore) by compiling ctest1.c +set(FU "") +if(APPLE) +set(FU "_") +elseif(MSVC) set(FU "_") +elseif(UNIX) +set(FU "") +endif() # Convert CMake vars into the format that OpenBLAS expects string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index c2ee62545..fad84de51 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -100,8 +100,8 @@ macro(SetDefaultL1) set(XGEMVNKERNEL zgemv_n.S) set(XGEMVTKERNEL zgemv_t.S) set(SCABS_KERNEL ../generic/cabs.c) - set(DCABS_KERNEL ../generic/cabs.S) - set(QCABS_KERNEL ../generic/cabs.S) + set(DCABS_KERNEL ../generic/cabs.c) + set(QCABS_KERNEL ../generic/cabs.c) set(LSAME_KERNEL ../generic/lsame.c) set(SAXPBYKERNEL ../arm/axpby.c) set(DAXPBYKERNEL ../arm/axpby.c) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index ce7f781dd..39ade0577 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2058,7 +2058,7 @@ endif () # add lapack-netlib folder to the sources set(LAPACKE_SOURCES "") foreach (LAE_FILE ${LAPACKE_REL_SRC}) - list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/SRC/${LAE_FILE}") + list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}") endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include") diff --git a/common.h b/common.h index 1fb2c7eaf..c87ed6652 100644 --- a/common.h +++ b/common.h @@ -460,6 +460,9 @@ typedef char* env_var_t; (__GNUC__ >= 3 && !defined(__cplusplus)) || \ _MSC_VER >= 1800) // Visual Studio 2013 supports complex #define OPENBLAS_COMPLEX_C99 +#ifndef __cplusplus + #include +#endif typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble 
_Complex openblas_complex_xdouble; diff --git a/common_x86_64.h b/common_x86_64.h index 8bb87c7c0..54377695c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -65,7 +65,11 @@ static void __inline blas_lock(volatile BLASULONG *address){ +#ifndef C_MSVC + int ret; +#else BLASULONG ret; +#endif do { while (*address) {YIELDING;}; diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt new file mode 100644 index 000000000..d3e15870b --- /dev/null +++ b/ctest/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories(${CMAKE_SOURCE_DIR}) + + +#foreach(test_bin ${OpenBLAS_Tests}) + +#endforeach() \ No newline at end of file diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 5db4fb5ee..3f40aa47d 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -154,13 +154,13 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("syr_thread.c" "HER" "her_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("syr_thread.c" "HER;LOWER" "her_thread_L" false "" "" false ${float_type}) - GenerateNamedObjects("syr_thread.c" "HEMVREV" "her_thread_V" false "" "" false ${float_type}) - GenerateNamedObjects("syr_thread.c" "LOWER;HEMVREV" "her_thread_M" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "HERREV" "her_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "LOWER;HERREV" "her_thread_M" false "" "" false ${float_type}) - GenerateNamedObjects("syr2_thread.c" "HER2" "her2_thread_U" false "" "" false ${float_type}) - GenerateNamedObjects("syr2_thread.c" "HER2;LOWER" "her2_thread_L" false "" "" false ${float_type}) - GenerateNamedObjects("syr2_thread.c" "HEMVREV" "her2_thread_V" false "" "" false ${float_type}) - GenerateNamedObjects("syr2_thread.c" "LOWER;HEMVREV" "her2_thread_M" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HER" "her2_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HER;LOWER" "her2_thread_L" 
false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HERREV" "her2_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "LOWER;HERREV" "her2_thread_M" false "" "" false ${float_type}) foreach (nu_smp_src ${NU_SMP_SOURCES}) string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_src}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 000000000..446fb8a44 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,31 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +enable_language(Fortran) + +set(OpenBLAS_Tests + sblat1 sblat2 sblat3 + dblat1 dblat2 dblat3 + cblat1 cblat2 cblat3 + zblat1 zblat2 zblat3) + +foreach(test_bin ${OpenBLAS_Tests}) +add_executable(${test_bin} ${test_bin}.f) +target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}_static) +endforeach() + +# $1 exec, $2 input, $3 output_result +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2" +) + +set(float_types s d c z) +foreach(float_type ${float_types}) +string(TOUPPER ${float_type} float_type_upper) +add_test(NAME "${float_type}blas1" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") +add_test(NAME "${float_type}blas2" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) +add_test(NAME "${float_type}blas3" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +endforeach() \ No newline at end of file From c2323dd4d2a65420f77c73f7b55c41ba469a47f8 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 15:18:42 +0200 Subject: [PATCH 213/257] really fix ARM locking - was writing 0 to lock variable, so was ineffective - only exit loop if both lock was 0 and strex was successful --- common_arm.h | 20 +++++++------------- 1 
file changed, 7 insertions(+), 13 deletions(-) diff --git a/common_arm.h b/common_arm.h index 135191057..2dabd4d7f 100644 --- a/common_arm.h +++ b/common_arm.h @@ -59,22 +59,16 @@ static void __inline blas_lock(volatile BLASULONG *address){ while (*address) {YIELDING;}; __asm__ __volatile__( - "1: \n\t" - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "cmp r3, #0 \n\t" - "bne 1b \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) - : "1"(address) - : "memory", "r2" , "r3" - - + "ldrex r2, [%1] \n\t" + "strex %0, %2, [%1] \n\t" + "orr %0, r2 \n\t" + : "=&r"(ret) + : "r"(address), "r"(1) + : "memory", "r2" ); } while (ret); - + MB; } From d3e2f0a1af73a6e74258294c911e7f4cb72d8ab5 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 15:37:02 +0200 Subject: [PATCH 214/257] add missing barriers should fix issue #597 --- driver/others/blas_server.c | 11 ++++++++++- driver/others/memory.c | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b3b1ce7bd..1fd848c6b 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -425,6 +425,10 @@ static int blas_thread_server(void *arg){ main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + WMB; + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; @@ -775,7 +779,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ stop = rpcc(); #endif - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) { + exec_blas_async_wait(num - 1, queue -> next); + + // arm: make sure results from other threads are visible + MB; + } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", diff --git a/driver/others/memory.c b/driver/others/memory.c index 
a562da377..49c57f911 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1153,6 +1153,9 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif + // arm: ensure all writes are finished before other thread takes this memory + WMB; + memory[position].used = 0; #ifdef DEBUG From e12cf1123e8784ce6fe9d2ac14526331fbe2c555 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 17:27:25 +0200 Subject: [PATCH 215/257] add fallback rpcc implementation - use on arm, arm64 and any new platform - use faster integer math instead of double - use similar scale as rdtsc so that timeouts work --- common.h | 28 ++++++++++++++++++++++++++++ common_alpha.h | 1 + common_arm.h | 10 ---------- common_arm64.h | 10 ---------- common_ia64.h | 2 ++ common_mips64.h | 1 + common_power.h | 1 + common_sparc.h | 1 + common_x86.h | 1 + common_x86_64.h | 1 + 10 files changed, 36 insertions(+), 20 deletions(-) diff --git a/common.h b/common.h index 320adadcb..5998b5608 100644 --- a/common.h +++ b/common.h @@ -410,7 +410,35 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif + +#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) +#ifdef _POSIX_MONOTONIC_CLOCK +#if defined(__GNUC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt +#define USE_MONOTONIC +#elif defined(OS_ANDROID) +#define USE_MONOTONIC +#endif +#endif +/* use similar scale as x86 rdtsc for timeouts to work correctly */ +static inline unsigned long long rpcc(void){ +#ifdef USE_MONOTONIC + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv,NULL); + return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#endif +} +#define RPCC_DEFINED +#define RPCC64BIT +#endif // !RPCC_DEFINED + +#ifndef RPCC_DEFINED +#error "rpcc() implementation is missing for your platform" #endif +#endif 
// !ASSEMBLER #ifdef OS_LINUX #include "common_linux.h" diff --git a/common_alpha.h b/common_alpha.h index 845fb316a..86f58966a 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -89,6 +89,7 @@ static __inline unsigned int rpcc(void){ return r0; } +#define RPCC_DEFINED #define HALT ldq $0, 0($0) diff --git a/common_arm.h b/common_arm.h index 2dabd4d7f..7e0c02306 100644 --- a/common_arm.h +++ b/common_arm.h @@ -72,16 +72,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } diff --git a/common_arm64.h b/common_arm64.h index aa310c5f2..cc08fa75b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -71,16 +71,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } diff --git a/common_ia64.h b/common_ia64.h index 8e92b5992..d1f210749 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -75,6 +75,7 @@ static __inline unsigned long rpcc(void) { __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); return clocks; } +#define RPCC_DEFINED static __inline unsigned long stmxcsr(void){ @@ -103,6 +104,7 @@ static __inline void blas_lock(volatile unsigned long *address){ static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); } +#define RPCC_DEFINED static __inline unsigned int stmxcsr(void) { return __getReg(_IA64_REG_AR_FPSR); diff --git a/common_mips64.h b/common_mips64.h index 
7cd86b375..bc1a52fb4 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -118,6 +118,7 @@ static inline unsigned int rpcc(void){ #endif return ret; } +#define RPCC_DEFINED #if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY diff --git a/common_power.h b/common_power.h index e9b5cb630..3b9471a17 100644 --- a/common_power.h +++ b/common_power.h @@ -103,6 +103,7 @@ static inline unsigned long rpcc(void){ #endif } +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_sparc.h b/common_sparc.h index 87ef75276..8a16e3d3a 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -66,6 +66,7 @@ static __inline unsigned long rpcc(void){ return clocks; }; +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_x86.h b/common_x86.h index 99a723fd7..9506716ce 100644 --- a/common_x86.h +++ b/common_x86.h @@ -73,6 +73,7 @@ static __inline unsigned long long rpcc(void){ return ((unsigned long long)a + ((unsigned long long)d << 32)); }; +#define RPCC_DEFINED static __inline unsigned long getstackaddr(void){ unsigned long addr; diff --git a/common_x86_64.h b/common_x86_64.h index efb902416..3a02beefb 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -82,6 +82,7 @@ static __inline BLASULONG rpcc(void){ return ((BLASULONG)a + ((BLASULONG)d << 32)); } +#define RPCC_DEFINED #define RPCC64BIT From f2ac1a5cee9eebfaad33194e362fa2c05e2b05d9 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:08:45 +0200 Subject: [PATCH 216/257] set ARMV7 for Cortex-A9 and Cortex-A15 otherwise some macros like YIELDING are not defined correctly --- common_arm.h | 4 ++++ cpuid_arm.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/common_arm.h b/common_arm.h index 7e0c02306..74b6378dd 100644 --- a/common_arm.h +++ b/common_arm.h @@ -124,4 +124,8 @@ REALNAME: #define MAP_ANONYMOUS MAP_ANON #endif +#if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8) +#error "you must define ARMV5, ARMV6, ARMV7 or 
ARMV8" +#endif + #endif diff --git a/cpuid_arm.c b/cpuid_arm.c index 51ba72d70..6485003f3 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -192,6 +192,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA9: printf("#define CORTEXA9\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); @@ -207,6 +208,7 @@ void get_cpuconfig(void) case CPU_CORTEXA15: printf("#define CORTEXA15\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); From 6b92204a7ce5faf8dab2301c59aa69a26f6b8a19 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:10:34 +0200 Subject: [PATCH 217/257] add fallback blas_lock implementation to be used on armv5 and new platforms --- common.h | 14 ++++++++++++++ common_alpha.h | 1 + common_arm.h | 4 ++++ common_arm64.h | 1 + common_ia64.h | 2 ++ common_mips64.h | 1 + common_power.h | 1 + common_sparc.h | 1 + common_x86.h | 1 + common_x86_64.h | 1 + 10 files changed, 27 insertions(+) diff --git a/common.h b/common.h index 5998b5608..6073f037f 100644 --- a/common.h +++ b/common.h @@ -435,9 +435,23 @@ static inline unsigned long long rpcc(void){ #define RPCC64BIT #endif // !RPCC_DEFINED +#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) +static void __inline blas_lock(volatile BLASULONG *address){ + + do { + while (*address) {YIELDING;}; + + } while (!__sync_bool_compare_and_swap(address, 0, 1)); +} +#define BLAS_LOCK_DEFINED +#endif + #ifndef RPCC_DEFINED #error "rpcc() implementation is missing for your platform" #endif +#ifndef BLAS_LOCK_DEFINED +#error "blas_lock() implementation is missing for your platform" +#endif #endif // !ASSEMBLER #ifdef OS_LINUX diff --git a/common_alpha.h b/common_alpha.h index 86f58966a..9739c941d 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -76,6 +76,7 @@ static void __inline blas_lock(unsigned long *address){ 
"30:", address); #endif } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void){ diff --git a/common_arm.h b/common_arm.h index 74b6378dd..84691d766 100644 --- a/common_arm.h +++ b/common_arm.h @@ -51,6 +51,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) + static void __inline blas_lock(volatile BLASULONG *address){ int register ret; @@ -71,6 +73,8 @@ static void __inline blas_lock(volatile BLASULONG *address){ MB; } +#define BLAS_LOCK_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; diff --git a/common_arm64.h b/common_arm64.h index cc08fa75b..c4e588d1f 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -69,6 +69,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline int blas_quickdivide(blasint x, blasint y){ diff --git a/common_ia64.h b/common_ia64.h index d1f210749..72b75fc4e 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -68,6 +68,7 @@ static __inline void blas_lock(volatile unsigned long *address){ : "ar.ccv", "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void) { unsigned long clocks; @@ -100,6 +101,7 @@ static __inline void blas_lock(volatile unsigned long *address){ while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) ; } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); diff --git a/common_mips64.h b/common_mips64.h index bc1a52fb4..f5c0ec7cf 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -98,6 +98,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned int rpcc(void){ unsigned long ret; diff --git a/common_power.h b/common_power.h index 3b9471a17..ab331b04a 100644 --- a/common_power.h +++ b/common_power.h @@ -87,6 +87,7 @@ 
static void INLINE blas_lock(volatile unsigned long *address){ #endif } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned long rpcc(void){ unsigned long ret; diff --git a/common_sparc.h b/common_sparc.h index 8a16e3d3a..f99972db9 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -58,6 +58,7 @@ static void __inline blas_lock(volatile unsigned long *address){ : "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void){ unsigned long clocks; diff --git a/common_x86.h b/common_x86.h index 9506716ce..6c90432a2 100644 --- a/common_x86.h +++ b/common_x86.h @@ -65,6 +65,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long long rpcc(void){ unsigned int a, d; diff --git a/common_x86_64.h b/common_x86_64.h index 3a02beefb..4c783b315 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -74,6 +74,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline BLASULONG rpcc(void){ BLASULONG a, d; From d38a1ddc7a4ef8c10017ae5b81a447e322721b94 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:13:30 +0200 Subject: [PATCH 218/257] use real armv5 support there is no more requirement for ARMv6 instructions, and VFP on ARMv5 is uncommon --- Makefile.arm | 4 ++-- common_arm.h | 5 ++++- getarch.c | 3 +-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index 2f7b33730..272220ca9 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -26,8 +26,8 @@ endif ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 -FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +CCOMMON_OPT += -marm -march=armv5 +FCOMMON_OPT += -marm -march=armv5 endif diff --git a/common_arm.h b/common_arm.h index 84691d766..6bf836835 100644 --- a/common_arm.h +++ b/common_arm.h @@ -80,7 +80,10 @@ static inline int 
blas_quickdivide(blasint x, blasint y){ return x / y; } -#if defined(DOUBLE) +#if !defined(HAVE_VFP) +/* no FPU, soft float */ +#define GET_IMAGE(res) +#elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") diff --git a/getarch.c b/getarch.c index d56a37a7a..89e736a31 100644 --- a/getarch.c +++ b/getarch.c @@ -798,8 +798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DARMV5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "armv5" #define CORENAME "ARMV5" #else From 3efeaed0d867c9d54701e9351de44e747cd21578 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 20:11:13 +0200 Subject: [PATCH 219/257] correct a minor mistake --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 6073f037f..a607c888b 100644 --- a/common.h +++ b/common.h @@ -413,7 +413,7 @@ typedef char* env_var_t; #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK -#if defined(__GNUC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt +#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt #define USE_MONOTONIC #elif defined(OS_ANDROID) #define USE_MONOTONIC From abade3f896634ebf6b9097469a05cb72d8fd7860 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Mon, 17 Aug 2015 01:27:45 +0200 Subject: [PATCH 220/257] really fix ARM64 locking --- common_arm64.h | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index c4e588d1f..15987c677 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -45,29 +45,22 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void __inline blas_lock(volatile BLASULONG *address){ - int register ret; - int register tmp; + long register ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( - "1: \n\t" - "ldaxr %2, [%1] \n\t" - "mov %2, #0 \n\t" - "stlxr %w0, %2, [%1] \n\t" - "cbnz %w0, 1b \n\t" - "mov %0 , #0 \n\t" - : "=r"(ret), "=r"(address), "=r"(tmp) - : "1"(address) - : "memory", "%w0" - //, "%r2" , "%r3" - - + "ldaxr %0, [%1] \n\t" + "stlxr w2, %2, [%1] \n\t" + "orr %0, %0, x2 \n\t" + : "=r"(ret) + : "r"(address), "r"(1l) + : "memory", "x2" ); } while (ret); - + MB; } #define BLAS_LOCK_DEFINED From 17ee2237c382e8ea3f9d3a8aa74aef4a1d12ff17 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 18 Aug 2015 22:43:42 -0500 Subject: [PATCH 221/257] Fixed cmake bug with NO_LAPACK=1 --- cmake/f_check.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index e189b683a..e8fe4bfa7 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -27,7 +27,10 @@ if (MSVC) endif () if (NOT NO_LAPACK) -enable_language(Fortran) + enable_language(Fortran) +else() + include(CMakeForceCompiler) + CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) From 7df08201606d903600d96c747c6ac070203a4d6f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 19 Aug 2015 08:07:47 -0500 Subject: [PATCH 222/257] Use C kernels for s/dgemv on x86. 
--- kernel/x86/KERNEL | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 8b30355ec..39be2ef80 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -98,6 +98,23 @@ ifndef XAXPYKERNEL XAXPYKERNEL = xaxpy.S endif +#Use C kernel for sgemv and dgemv +ifndef SGEMVNKERNEL +SGEMVNKERNEL = ../arm/gemv_n.c +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = ../arm/gemv_t.c +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = ../arm/gemv_n.c +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = ../arm/gemv_t.c +endif + ifndef QGEMVNKERNEL QGEMVNKERNEL = qgemv_n.S endif From 50901943fde1fb09cc4149c007873d4e21d424c5 Mon Sep 17 00:00:00 2001 From: The Gitter Badger Date: Thu, 20 Aug 2015 03:21:09 +0000 Subject: [PATCH 223/257] Added Gitter badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cdacf9888..16f874078 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # OpenBLAS +[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) ## Introduction From 5408074941d5cc0f4aad180562cafcf4cf27a56d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 19 Aug 2015 22:50:25 -0500 Subject: [PATCH 224/257] Add notification. 
--- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7d625c9dc..806cb0046 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,13 @@ language: c + +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always + compiler: - gcc From 2297a2d9893667d3222139013de7398680ec6f1a Mon Sep 17 00:00:00 2001 From: buffer51 Date: Thu, 3 Sep 2015 20:54:21 -0400 Subject: [PATCH 225/257] Fixed error in common.h for Android compilation introduced by e12cf1123e8784ce6fe9d2ac14526331fbe2c555 --- common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common.h b/common.h index a607c888b..c367e38cb 100644 --- a/common.h +++ b/common.h @@ -413,8 +413,10 @@ typedef char* env_var_t; #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK -#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt +#if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17) +#if __GLIBC_PREREQ(2, 17) // don't require -lrt #define USE_MONOTONIC +#endif #elif defined(OS_ANDROID) #define USE_MONOTONIC #endif From 711ca33bc6da03daf2115c7a82ae2a56f73d67a3 Mon Sep 17 00:00:00 2001 From: Martin Koehler Date: Mon, 7 Sep 2015 14:33:26 +0200 Subject: [PATCH 226/257] Improved Ximatcopy when lda==ldb. The Ximatcopy functions create a copy of the input matrix although they seem to work inplace. The new routines XIMATCOPY_K_YY perform the operations inplace if the leading dimension does not change. 
--- CONTRIBUTORS.md | 3 + common_c.h | 19 ++++ common_d.h | 9 ++ common_level3.h | 24 +++++ common_macro.h | 26 +++++ common_param.h | 30 ++++++ common_s.h | 8 ++ common_z.h | 18 ++++ interface/imatcopy.c | 35 ++++++- interface/zimatcopy.c | 50 ++++++++- kernel/Makefile.L3 | 181 +++++++++++++++++++++++++++++++++ kernel/generic/imatcopy_cn.c | 67 ++++++++++++ kernel/generic/imatcopy_ct.c | 91 +++++++++++++++++ kernel/generic/imatcopy_rn.c | 72 +++++++++++++ kernel/generic/imatcopy_rt.c | 64 ++++++++++++ kernel/generic/zimatcopy_cn.c | 67 ++++++++++++ kernel/generic/zimatcopy_cnc.c | 67 ++++++++++++ kernel/generic/zimatcopy_ct.c | 82 +++++++++++++++ kernel/generic/zimatcopy_ctc.c | 85 ++++++++++++++++ kernel/generic/zimatcopy_rn.c | 66 ++++++++++++ kernel/generic/zimatcopy_rnc.c | 65 ++++++++++++ kernel/generic/zimatcopy_rt.c | 80 +++++++++++++++ kernel/generic/zimatcopy_rtc.c | 82 +++++++++++++++ 23 files changed, 1288 insertions(+), 3 deletions(-) create mode 100644 kernel/generic/imatcopy_cn.c create mode 100644 kernel/generic/imatcopy_ct.c create mode 100644 kernel/generic/imatcopy_rn.c create mode 100644 kernel/generic/imatcopy_rt.c create mode 100644 kernel/generic/zimatcopy_cn.c create mode 100644 kernel/generic/zimatcopy_cnc.c create mode 100644 kernel/generic/zimatcopy_ct.c create mode 100644 kernel/generic/zimatcopy_ctc.c create mode 100644 kernel/generic/zimatcopy_rn.c create mode 100644 kernel/generic/zimatcopy_rnc.c create mode 100644 kernel/generic/zimatcopy_rt.c create mode 100644 kernel/generic/zimatcopy_rtc.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b88e3671b..88e461dc4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -127,5 +127,8 @@ In chronological order: * Ton van den Heuvel * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). 
+* Martin Koehler + * [2015-09-07] Improved imatcopy + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/common_c.h b/common_c.h index 741d7d087..ce0f2a5bd 100644 --- a/common_c.h +++ b/common_c.h @@ -220,6 +220,15 @@ #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc +#define CIMATCOPY_K_CN cimatcopy_k_cn +#define CIMATCOPY_K_RN cimatcopy_k_rn +#define CIMATCOPY_K_CT cimatcopy_k_ct +#define CIMATCOPY_K_RT cimatcopy_k_rt +#define CIMATCOPY_K_CNC cimatcopy_k_cnc +#define CIMATCOPY_K_RNC cimatcopy_k_rnc +#define CIMATCOPY_K_CTC cimatcopy_k_ctc +#define CIMATCOPY_K_RTC cimatcopy_k_rtc + #define CGEADD_K cgeadd_k #else @@ -403,6 +412,16 @@ #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc + +#define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn +#define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn +#define CIMATCOPY_K_CT gotoblas -> cimatcopy_k_ct +#define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt +#define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc +#define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc +#define CIMATCOPY_K_CTC gotoblas -> cimatcopy_k_ctc +#define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc + #define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index d6dfd7f04..ad9945186 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,11 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt + +#define DIMATCOPY_K_CN dimatcopy_k_cn +#define DIMATCOPY_K_RN dimatcopy_k_rn +#define DIMATCOPY_K_CT dimatcopy_k_ct +#define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k #else @@ -267,6 +272,10 @@ #define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn 
+#define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn +#define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct +#define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt #define DGEADD_K gotoblas -> dgeadd_k diff --git a/common_level3.h b/common_level3.h index e0ecbc4e2..1f5490baa 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1736,31 +1736,55 @@ int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLAS int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG); int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG); int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float 
*, BLASLONG); +int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int 
zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8555baa67..4976e766f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,6 +634,11 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT +#define IMATCOPY_K_CN DIMATCOPY_K_CN +#define IMATCOPY_K_RN DIMATCOPY_K_RN +#define IMATCOPY_K_CT DIMATCOPY_K_CT +#define IMATCOPY_K_RT DIMATCOPY_K_RT + #define GEADD_K DGEADD_K #else @@ -931,6 +936,10 @@ #define OMATCOPY_K_RN SOMATCOPY_K_RN #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K #endif @@ -1747,6 +1756,15 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define IMATCOPY_K_CN ZIMATCOPY_K_CN +#define IMATCOPY_K_RN ZIMATCOPY_K_RN +#define IMATCOPY_K_CT ZIMATCOPY_K_CT +#define IMATCOPY_K_RT ZIMATCOPY_K_RT +#define IMATCOPY_K_CNC ZIMATCOPY_K_CNC +#define IMATCOPY_K_RNC ZIMATCOPY_K_RNC +#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC +#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC + #define GEADD_K ZGEADD_K #else @@ -2160,6 +2178,14 @@ #define OMATCOPY_K_RNC COMATCOPY_K_RNC #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define IMATCOPY_K_CN CIMATCOPY_K_CN +#define IMATCOPY_K_RN CIMATCOPY_K_RN +#define IMATCOPY_K_CT CIMATCOPY_K_CT +#define IMATCOPY_K_RT 
CIMATCOPY_K_RT +#define IMATCOPY_K_CNC CIMATCOPY_K_CNC +#define IMATCOPY_K_RNC CIMATCOPY_K_RNC +#define IMATCOPY_K_CTC CIMATCOPY_K_CTC +#define IMATCOPY_K_RTC CIMATCOPY_K_RTC #define GEADD_K CGEADD_K diff --git a/common_param.h b/common_param.h index 1b56e85f0..ab40ddeef 100644 --- a/common_param.h +++ b/common_param.h @@ -830,31 +830,61 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, 
BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, 
double*, BLASLONG); + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index a4d8679b7..3c1600859 100644 --- a/common_s.h +++ b/common_s.h @@ -152,6 +152,10 @@ #define SOMATCOPY_K_RN somatcopy_k_rn #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SIMATCOPY_K_CN simatcopy_k_cn +#define SIMATCOPY_K_RN simatcopy_k_rn +#define SIMATCOPY_K_CT simatcopy_k_ct +#define SIMATCOPY_K_RT simatcopy_k_rt #define SGEADD_K sgeadd_k @@ -274,6 +278,10 @@ #define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SIMATCOPY_K_CN gotoblas -> simatcopy_k_cn +#define SIMATCOPY_K_RN gotoblas -> simatcopy_k_rn +#define SIMATCOPY_K_CT gotoblas -> simatcopy_k_ct +#define SIMATCOPY_K_RT gotoblas -> simatcopy_k_rt #define SGEADD_K gotoblas -> sgeadd_k diff --git a/common_z.h b/common_z.h index b17122776..b4f58bb0c 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,15 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZIMATCOPY_K_CN zimatcopy_k_cn +#define ZIMATCOPY_K_RN zimatcopy_k_rn +#define ZIMATCOPY_K_CT zimatcopy_k_ct +#define ZIMATCOPY_K_RT zimatcopy_k_rt +#define ZIMATCOPY_K_CNC zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC zimatcopy_k_rtc 
+ #define ZGEADD_K zgeadd_k #else @@ -404,6 +413,15 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZIMATCOPY_K_CN gotoblas -> zimatcopy_k_cn +#define ZIMATCOPY_K_RN gotoblas -> zimatcopy_k_rn +#define ZIMATCOPY_K_CT gotoblas -> zimatcopy_k_ct +#define ZIMATCOPY_K_RT gotoblas -> zimatcopy_k_rt +#define ZIMATCOPY_K_CNC gotoblas -> zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC gotoblas -> zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC gotoblas -> zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC gotoblas -> zimatcopy_k_rtc + #define ZGEADD_K gotoblas -> zgeadd_k #endif diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 89f0ec823..f4309a85c 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -50,6 +51,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef malloc #undef free +/* Enables the New IMATCOPY code with inplace operation if lda == ldb */ +#define NEW_IMATCOPY + #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -75,7 +79,6 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { - char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; @@ -117,6 +120,34 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } +#ifdef NEW_IMATCOPY + if ( *lda == *ldb ) { + if ( order == BlasColMajor ) + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + } + } + else + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + } + } + return; + } + +#endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 3f273cf13..798bff585 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -49,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BlasTransConj 2 #define BlasConj 3 +#define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) @@ -124,6 +126,52 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, return; } +#ifdef NEW_IMATCOPY + if (*lda == *ldb) { + if ( order == BlasColMajor ) + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + else + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + return(0); + } +#endif + if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4ef351de3..7da4bcb92 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -334,11 +334,15 @@ endif SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) 
dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ @@ -346,6 +350,10 @@ CBLASOBJS += \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ @@ -353,6 +361,10 @@ ZBLASOBJS += \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) @@ -3305,6 +3317,34 @@ endif $(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef DIMATCOPY_CN +DIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RN +DIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CT +DIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + 
+$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RT +DIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef SOMATCOPY_CN SOMATCOPY_CN = ../arm/omatcopy_cn.c endif @@ -3333,6 +3373,34 @@ endif $(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef SIMATCOPY_CN +SIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RN +SIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CT +SIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RT +SIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef COMATCOPY_CN COMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3390,6 +3458,63 @@ endif $(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef CIMATCOPY_CN +CIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RN +CIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ 
$< -o $@ + +ifndef CIMATCOPY_CT +CIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RT +CIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CNC +CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RNC +CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CTC +CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RTC +CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + ifndef ZOMATCOPY_CN ZOMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3447,6 +3572,62 @@ endif $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef ZIMATCOPY_CN +ZIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RN +ZIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CT +ZIMATCOPY_CT = 
../generic/zimatcopy_ct.c +endif + +$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RT +ZIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CNC +ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RNC +ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CTC +ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RTC +ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + ifndef SGEADD_K SGEADD_K = ../generic/geadd.c diff --git a/kernel/generic/imatcopy_cn.c b/kernel/generic/imatcopy_cn.c new file mode 100644 index 000000000..e63bc976c --- /dev/null +++ b/kernel/generic/imatcopy_cn.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2015-09-07 grisuthedragon +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda) +{ + BLASLONG i,j; + FLOAT *aptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + if ( alpha == 1.0 ) return(0); + + aptr = a; + if ( alpha == 0.0 ) + { + for ( i=0; i Date: Wed, 9 Sep 2015 09:25:48 +0930 Subject: [PATCH 227/257] Fix lantr preparation for row major matrices --- lapack-netlib/lapacke/src/lapacke_clantr.c | 2 +- lapack-netlib/lapacke/src/lapacke_clantr_work.c | 6 +++--- lapack-netlib/lapacke/src/lapacke_dlantr.c | 2 +- lapack-netlib/lapacke/src/lapacke_dlantr_work.c | 6 +++--- lapack-netlib/lapacke/src/lapacke_slantr.c | 2 +- lapack-netlib/lapacke/src/lapacke_slantr_work.c | 6 +++--- lapack-netlib/lapacke/src/lapacke_zlantr.c | 2 +- lapack-netlib/lapacke/src/lapacke_zlantr_work.c | 6 +++--- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/lapacke/src/lapacke_clantr.c b/lapack-netlib/lapacke/src/lapacke_clantr.c index 77743f2d5..00ba34273 100644 --- a/lapack-netlib/lapacke/src/lapacke_clantr.c +++ b/lapack-netlib/lapacke/src/lapacke_clantr.c @@ -53,7 +53,7 @@ float LAPACKE_clantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_clantr_work.c b/lapack-netlib/lapacke/src/lapacke_clantr_work.c index cb253a11e..1fa8cd923 100644 --- a/lapack-netlib/lapacke/src/lapacke_clantr_work.c +++ 
b/lapack-netlib/lapacke/src/lapacke_clantr_work.c @@ -47,7 +47,7 @@ float LAPACKE_clantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -57,13 +57,13 @@ float LAPACKE_clantr_work( int matrix_order, char norm, char uplo, } /* Allocate memory for temporary array(s) */ a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_ctr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! 
*/ diff --git a/lapack-netlib/lapacke/src/lapacke_dlantr.c b/lapack-netlib/lapacke/src/lapacke_dlantr.c index 522122cb2..2cde1ebad 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlantr.c +++ b/lapack-netlib/lapacke/src/lapacke_dlantr.c @@ -53,7 +53,7 @@ double LAPACKE_dlantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_dlantr_work.c b/lapack-netlib/lapacke/src/lapacke_dlantr_work.c index 0a937bda9..44d638fa5 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_dlantr_work.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); double* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -55,13 +55,13 @@ double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo, return info; } /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_dtr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_dtr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! 
*/ diff --git a/lapack-netlib/lapacke/src/lapacke_slantr.c b/lapack-netlib/lapacke/src/lapacke_slantr.c index d6a512027..80313d118 100644 --- a/lapack-netlib/lapacke/src/lapacke_slantr.c +++ b/lapack-netlib/lapacke/src/lapacke_slantr.c @@ -53,7 +53,7 @@ float LAPACKE_slantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_slantr_work.c b/lapack-netlib/lapacke/src/lapacke_slantr_work.c index 2389468d0..9032f7094 100644 --- a/lapack-netlib/lapacke/src/lapacke_slantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_slantr_work.c @@ -46,7 +46,7 @@ float LAPACKE_slantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); float* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -55,13 +55,13 @@ float LAPACKE_slantr_work( int matrix_order, char norm, char uplo, return info; } /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_str_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_str_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! 
*/ diff --git a/lapack-netlib/lapacke/src/lapacke_zlantr.c b/lapack-netlib/lapacke/src/lapacke_zlantr.c index 887bc2eea..001ce68f6 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlantr.c +++ b/lapack-netlib/lapacke/src/lapacke_zlantr.c @@ -53,7 +53,7 @@ double LAPACKE_zlantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_zlantr_work.c b/lapack-netlib/lapacke/src/lapacke_zlantr_work.c index 65e741428..8700a6ee2 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_zlantr_work.c @@ -47,7 +47,7 @@ double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -57,13 +57,13 @@ double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo, } /* Allocate memory for temporary array(s) */ a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_ztr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! 
*/ From 61ae47eb9926a869942267b3dc1b62a139e36ebe Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 9 Sep 2015 11:00:23 -0400 Subject: [PATCH 228/257] Ref #632. Support Intel Skylake by Haswell kernels. --- cpuid_x86.c | 20 ++++++++++++++++++++ driver/others/dynamic.c | 9 +++++++++ 2 files changed, 29 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4f97cfb5a..828ecc328 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1147,6 +1147,16 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1622,6 +1632,16 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ff80504f9..1f70b798c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -263,6 +263,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Skylake + if (model == 14) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: From cc7cab8a45d031e7e0e78147a863a632d584ed9d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 9 Sep 2015 10:47:17 -0500 Subject: [PATCH 229/257] Detect other Intel Skylake cores. 
http://users.atw.hu/instlatx64/ --- cpuid_x86.c | 22 ++++++++++++++++++++++ driver/others/dynamic.c | 11 ++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 828ecc328..135ac7cf9 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1133,6 +1133,16 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1150,6 +1160,7 @@ int get_cpuname(void){ #endif else return CPUTYPE_NEHALEM; + case 5: case 14: // Skylake if(support_avx()) @@ -1618,6 +1629,16 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; @@ -1635,6 +1656,7 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 5: case 14: // Skylake if(support_avx()) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f70b798c..c41164559 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -252,6 +252,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Skylake + if (model == 14) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; case 5: //Intel Broadwell @@ -264,7 +273,7 @@ static gotoblas_t *get_coretype(void){ } } //Intel Skylake - if (model == 14) { + if (model == 14 || model == 5) { if(support_avx()) return &gotoblas_HASWELL; else{ From d6e8459f201ec0e95da31d9886f413f9fd10a034 Mon Sep 17 00:00:00 2001 From: kortschak Date: Thu, 10 Sep 2015 15:32:50 +0930 Subject: [PATCH 230/257] Fix LAPACK_*lansy routines Fixes #639. --- lapack-netlib/lapacke/src/lapacke_clansy.c | 4 ++-- lapack-netlib/lapacke/src/lapacke_dlansy.c | 4 ++-- lapack-netlib/lapacke/src/lapacke_slansy.c | 4 ++-- lapack-netlib/lapacke/src/lapacke_zlansy.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/lapacke/src/lapacke_clansy.c b/lapack-netlib/lapacke/src/lapacke_clansy.c index 84a9d965a..eb9951145 100644 --- a/lapack-netlib/lapacke/src/lapacke_clansy.c +++ b/lapack-netlib/lapacke/src/lapacke_clansy.c @@ -51,7 +51,7 @@ float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_clansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_dlansy.c b/lapack-netlib/lapacke/src/lapacke_dlansy.c index 5e6721ef8..3d9964202 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlansy.c +++ b/lapack-netlib/lapacke/src/lapacke_dlansy.c @@ -51,7 +51,7 @@ double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n, 
#endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_dlansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_slansy.c b/lapack-netlib/lapacke/src/lapacke_slansy.c index 105ce4635..adad99b7d 100644 --- a/lapack-netlib/lapacke/src/lapacke_slansy.c +++ b/lapack-netlib/lapacke/src/lapacke_slansy.c @@ -51,7 +51,7 @@ float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_slansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_zlansy.c b/lapack-netlib/lapacke/src/lapacke_zlansy.c index 891437846..460a51a85 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlansy.c +++ b/lapack-netlib/lapacke/src/lapacke_zlansy.c @@ -51,7 +51,7 @@ double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory 
for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_zlansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: From baec8f5cacfb2be6e1a73d4abfbb0eaf32d8d44a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 10 Sep 2015 10:32:07 -0500 Subject: [PATCH 231/257] Refs #638. Fixed compiling bug with clang on Mac OS X. --- interface/zimatcopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 798bff585..b1e1d15dc 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -168,7 +168,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); } } - return(0); + return; } #endif From 45c8b5e7567926872fd5ef69a73f5bd5e51efa39 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Tue, 22 Sep 2015 12:00:30 +0200 Subject: [PATCH 232/257] actually remove cblas_noconst.h This file hasn't been used since 212463dce961827421a9c54f109a430c1599732c. 
--- cblas_noconst.h | 350 ------------------------------------------------ 1 file changed, 350 deletions(-) delete mode 100644 cblas_noconst.h diff --git a/cblas_noconst.h b/cblas_noconst.h deleted file mode 100644 index 4451c304e..000000000 --- a/cblas_noconst.h +++ /dev/null @@ -1,350 +0,0 @@ -#ifndef CBLAS_H -#define CBLAS_H - -#include -#include "common.h" - -#ifdef __cplusplus -extern "C" { - /* Assume C declarations for C++ */ -#endif /* __cplusplus */ - -/*Set the number of threads on runtime.*/ -void openblas_set_num_threads(int num_threads); -void goto_set_num_threads(int num_threads); - -/*Get the number of threads on runtime.*/ -int openblas_get_num_threads(void); - -/*Get the number of physical processors (cores).*/ -int openblas_get_num_procs(void); - -/*Get the build configure on runtime.*/ -char* openblas_get_config(void); - -/* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); -/* OpenBLAS is compiled for sequential use */ -#define OPENBLAS_SEQUENTIAL 0 -/* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 -/* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 - - -#define CBLAS_INDEX size_t - -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; -typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; -typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; -typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; -typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - -float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); - 
-openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); - -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); - -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); - -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); - -void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); - -void cblas_scopy(blasint n, float *x, blasint incx, float *y, 
blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); - -void cblas_srotg(float *a, float *b, float *c, float *s); -void cblas_drotg(double *a, double *b, double *c, double *s); - -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); - -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); - -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); - -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double 
beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); - -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); - -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, 
blasint lda, double *X, blasint incX); - -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); - -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); - -void cblas_sgbmv(enum CBLAS_ORDER order, enum 
CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); - - -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stbsv(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG 
Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - - -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); - -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); - -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); - -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, 
blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); - -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double 
*alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); -void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); - -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO 
Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO 
Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); - -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); - -void cblas_xerbla(blasint p, char *rout, char *form, ...); - -/*** BLAS extensions ***/ - -void cblas_saxpby(blasint n, float alpha, float *x, blasint incx,float beta, 
float *y, blasint incy); - -void cblas_daxpby(blasint n, double alpha, double *x, blasint incx,double beta, double *y, blasint incy); - -void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, float *y, blasint incy); - -void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy); - -void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, - blasint clda, float *b, blasint cldb); -void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, double *b, blasint cldb); -void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); -void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); - -void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, - blasint clda, blasint cldb); -void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, blasint cldb); -void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a, - blasint clda, blasint cldb); -void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, - blasint clda, blasint cldb); - -void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta, - float *c, blasint cldc); -void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta, - double *c, 
blasint cldc); -void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta, - float *c, blasint cldc); -void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta, - double *c, blasint cldc); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif From b9534bbd76fd09c63432f4a05c46de4c8d563614 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Tue, 22 Sep 2015 12:01:09 +0200 Subject: [PATCH 233/257] git ignore versioned .so files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7422cead3..8df228993 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h lapack-netlib/TESTING/testing_results.txt *.so +*.so.* *.a .svn *~ From f27942a68aced9933761c5d608dfb45e8fd10e8a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 26 Sep 2015 14:42:44 +0000 Subject: [PATCH 234/257] Fixed make TARGET=CORTEXA9 and CORTEXA15 bug. --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 89e736a31..c05b90410 100644 --- a/getarch.c +++ b/getarch.c @@ -750,7 +750,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA9" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA9 " \ +#define ARCHCONFIG "-DCORTEXA9 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ @@ -765,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA15" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA15 " \ +#define ARCHCONFIG "-DCORTEXA15 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ From 88bef3bffcfd1715bb26cc410a7d60163dcfd4a1 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 1 Oct 2015 15:07:04 -0400 Subject: [PATCH 235/257] default to lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX), as discussed in #646: if you rename the symbols, it is best to rename the library --- Makefile.rule | 3 +++ Makefile.system | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 19f3fe3d9..22f222e3f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -169,6 +169,9 @@ COMMON_PROF = -pg # 64 bit integer interfaces in OpenBLAS. # For details, https://github.com/xianyi/OpenBLAS/pull/459 # +# The same prefix and suffix are also added to the library name, +# i.e. 
you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas +# # SYMBOLPREFIX= # SYMBOLSUFFIX= diff --git a/Makefile.system b/Makefile.system index 325ee6af9..42ad49849 100644 --- a/Makefile.system +++ b/Makefile.system @@ -880,12 +880,6 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifndef LIBNAMESUFFIX -LIBPREFIX = libopenblas -else -LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) -endif - ifndef SYMBOLPREFIX SYMBOLPREFIX = endif @@ -894,6 +888,12 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBNAMESUFFIX +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +else +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +endif + KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) From e9493f69ebc706a974b1650cded21b70115668de Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Mon, 5 Oct 2015 00:58:07 -0400 Subject: [PATCH 236/257] Fix cross compilation suffix detection If the path involves `-`, this would have otherwise detected this as a cross compile suffix. --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 0fdadb659..d694e7411 100644 --- a/c_check +++ b/c_check @@ -30,7 +30,7 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { $cross_suffix = $1; } } else { - if ($ARGV[0] =~ /(.*-)(.*)/) { + if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { $cross_suffix = $1; } } From 11ac4665c835a27a097e5021074cbf366bcb9765 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 5 Oct 2015 14:14:32 -0500 Subject: [PATCH 237/257] Fixed #654. Make sure the gotoblas_init function is run before all other static initializations. --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 49c57f911..f75a47d65 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; From 90aa8e24b94ce8bbf73e60f9c69c50a2b18565da Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 7 Oct 2015 02:31:51 +0800 Subject: [PATCH 238/257] Refs #615. Import bug fixes for LAPACKE dormlq. --- lapack-netlib/lapacke/src/lapacke_cunmlq_work.c | 16 +++++++++------- lapack-netlib/lapacke/src/lapacke_dormlq_work.c | 16 +++++++++------- lapack-netlib/lapacke/src/lapacke_sormlq_work.c | 16 +++++++++------- lapack-netlib/lapacke/src/lapacke_zunmlq_work.c | 16 +++++++++------- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c b/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c index 1cd20e1ca..5cf66424d 100644 --- a/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_cunmlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau, @@ -41,20 +41,22 @@ lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, lapack_complex_float* work, lapack_int lwork ) { lapack_int info = 0; - if( matrix_order == LAPACK_COL_MAJOR ) { + lapack_int r; + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_cunmlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; lapack_int lda_t = MAX(1,k); lapack_int ldc_t = MAX(1,m); lapack_complex_float* a_t = NULL; lapack_complex_float* c_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_cunmlq_work", info ); return info; @@ -84,8 +86,8 @@ lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_cunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_dormlq_work.c b/lapack-netlib/lapacke/src/lapacke_dormlq_work.c index 9a7a997fe..99a7c3c71 100644 --- a/lapack-netlib/lapacke/src/lapacke_dormlq_work.c +++ 
b/lapack-netlib/lapacke/src/lapacke_dormlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,27 +33,29 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_dormlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc, double* work, lapack_int lwork ) { lapack_int info = 0; + lapack_int r; lapack_int lda_t, ldc_t; double *a_t = NULL, *c_t = NULL; - if( matrix_order == LAPACK_COL_MAJOR ) { + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dormlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? 
m : n; lda_t = MAX(1,k); ldc_t = MAX(1,m); /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_dormlq_work", info ); return info; @@ -81,8 +83,8 @@ lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_dormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_sormlq_work.c b/lapack-netlib/lapacke/src/lapacke_sormlq_work.c index 7a7464d18..bbf55bd84 100644 --- a/lapack-netlib/lapacke/src/lapacke_sormlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_sormlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -33,27 +33,29 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_sormlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work, lapack_int lwork ) { lapack_int info = 0; + lapack_int r; lapack_int lda_t, ldc_t; float *a_t = NULL, *c_t = NULL; - if( matrix_order == LAPACK_COL_MAJOR ) { + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_sormlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; lda_t = MAX(1,k); ldc_t = MAX(1,m); /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_sormlq_work", info ); return info; @@ -81,8 +83,8 @@ lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_sormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c b/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c index 8677ac0bc..38a2d947a 100644 --- a/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. 
+ Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_zunmlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau, @@ -41,20 +41,22 @@ lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, lapack_complex_double* work, lapack_int lwork ) { lapack_int info = 0; - if( matrix_order == LAPACK_COL_MAJOR ) { + lapack_int r; + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zunmlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; lapack_int lda_t = MAX(1,k); lapack_int ldc_t = MAX(1,m); lapack_complex_double* a_t = NULL; lapack_complex_double* c_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_zunmlq_work", info ); return info; @@ -84,8 +86,8 @@ lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_zunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); From 3684706a121f9d9e1ccfc4a2bbb98f698eb04514 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 8 Oct 2015 15:07:24 +0000 Subject: [PATCH 239/257] Include time.h. 
--- common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common.h b/common.h index c367e38cb..c7660a7dd 100644 --- a/common.h +++ b/common.h @@ -114,6 +114,7 @@ extern "C" { #include #endif #include +#include #include #include #ifdef SMP From aca7d7e953712703b1571b05d47b7c2cd515d6f9 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 20 Oct 2015 03:35:25 +0800 Subject: [PATCH 240/257] Detect cmake test result. --- test/CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 446fb8a44..cd4497117 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,7 +16,14 @@ endforeach() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" -"$1 < $2" +"$1 < $2\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" ) set(float_types s d c z) From d8392c1245f87661c66f6e05d8c8091927630a4d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 20 Oct 2015 04:30:55 +0800 Subject: [PATCH 241/257] Fixe cmake config bugs. --- driver/level2/CMakeLists.txt | 21 +++++- driver/level3/CMakeLists.txt | 29 ++++++--- kernel/CMakeLists.txt | 120 +++++++++++++++++++++++++---------- 3 files changed, 125 insertions(+), 45 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 3f40aa47d..696767486 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -172,11 +172,26 @@ foreach (float_type ${FLOAT_TYPES}) endif () else () - # N.B. 
BLAS wants to put the U/L from the filename in the *MIDDLE* - GenerateCombinationObjects("${U_SOURCES};${L_SOURCES}" "TRANSA;UNIT" "N;N" "" 3 "" false ${float_type}) + # For real number functions + foreach (u_source ${U_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${u_source}) + GenerateCombinationObjects("${u_source}" "UNIT" "N" "" 0 "${op_name}_NU" false ${float_type}) + GenerateCombinationObjects("${u_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TL" false ${float_type}) + endforeach () + + foreach (l_source ${L_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${l_source}) + GenerateCombinationObjects("${l_source}" "UNIT" "N" "" 0 "${op_name}_NL" false ${float_type}) + GenerateCombinationObjects("${l_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TU" false ${float_type}) + endforeach () + if (SMP) GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) - GenerateCombinationObjects("${NU_SMP_SOURCES}" "TRANSA;LOWER;UNIT" "N;U;N" "" 2 "" false ${float_type}) + foreach(nu_smp_source ${NU_SMP_SOURCES}) + string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_source}) + GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "" 0 "${op_name}_N" false ${float_type}) + GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "TRANSA" 0 "${op_name}_T" false ${float_type}) + endforeach() endif () endif () endforeach () diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 6d623b0c2..41d440f7a 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -13,11 +13,24 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () endforeach () -GenerateCombinationObjects("trmm_L.c;trmm_R.c;trsm_L.c;trsm_R.c" "TRANS;UPPER;UNIT" "N;L;N" "" 0) -GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "NN" 1) -GenerateCombinationObjects("syrk_k.c;syr2k_k.c" "LOWER;TRANS" "U;N" "" 1) -GenerateCombinationObjects("syrk_kernel.c;syr2k_kernel.c" "LOWER" "U" "" 2) +set(TRMM_TRSM_SOURCES + trmm_L.c + trmm_R.c + trsm_L.c + 
trsm_R.c) + +foreach(trmm_trsm_source ${TRMM_TRSM_SOURCES}) + string(REGEX MATCH "[a-z]+_[A-Z]+" op_name ${trmm_trsm_source}) + GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "" 0 "${op_name}N") + GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "TRANSA" 0 "${op_name}T") +endforeach() + +GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "NN" 1) +GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "" 1) +GenerateCombinationObjects("syr2k_k.c" "LOWER;TRANS" "U;N" "" 1) +GenerateCombinationObjects("syrk_kernel.c" "LOWER" "U" "" 2) +GenerateCombinationObjects("syr2k_kernel.c" "LOWER" "U" "" 2) if (SMP) # N.B. these do NOT have a float type (e.g. DOUBLE) defined! @@ -39,13 +52,13 @@ foreach (float_type ${FLOAT_TYPES}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) # Need to set CONJ for trmm and trsm GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) - GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trmm_LC" false ${float_type}) + GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type}) GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_RR" false ${float_type}) - GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trmm_RC" false ${float_type}) + GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_RC" false ${float_type}) GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_LR" false ${float_type}) - GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_LC" false ${float_type}) + GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_LC" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) - GenerateCombinationObjects("trsm_R.c" 
"UPPER;UNIT" "L;N" "TRANS;CONJ" 0 "trsm_RC" false ${float_type}) + GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_RC" false ${float_type}) #hemm GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN" 0 "hemm_L" false ${float_type}) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d2cc77b11..bd32544f4 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -155,14 +155,14 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA;CONJ;CN" "trmm_kernel_LC" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "NN" "trmm_kernel_RN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;CONJ;CN" "trmm_kernel_LR" false "" 
"" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;CONJ;CN" "trmm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;NN" "trmm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "UPPER;LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) @@ -176,62 +176,114 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) - else () + # symm for c and z + GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + 
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) + 
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + 
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + + else () #For real GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) - endif () - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + # symm for s and d + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
+ GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" 
"trmm_outucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) + 
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" 
"trsm_iltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "" "symm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") From 96f0bbe067706dae68301f7d049625f2cee689e3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 21 Oct 2015 02:24:54 +0800 Subject: [PATCH 242/257] Fixed cmake bug on haswell. 
--- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index bd32544f4..2d7b18973 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -114,7 +114,7 @@ endforeach () # Makefile.L3 set(USE_TRMM false) -if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic") +if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell") set(USE_TRMM true) endif () From 8fade093aaa3748e5e879fcf0fed1833915d7aab Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 20 Oct 2015 14:37:22 -0500 Subject: [PATCH 243/257] Fixed cmake bug on Visual Studio. --- common.h | 3 +- interface/CMakeLists.txt | 55 +++++++++++++++++++------------- interface/zdot.c | 7 +++-- interface/zgemv.c | 3 ++ interface/zsyr.c | 3 ++ kernel/CMakeLists.txt | 68 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 113 insertions(+), 26 deletions(-) diff --git a/common.h b/common.h index 0b0bdb812..d8eadb421 100644 --- a/common.h +++ b/common.h @@ -502,8 +502,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus)) || \ - _MSC_VER >= 1800) // Visual Studio 2013 supports complex + (__GNUC__ >= 3 && !defined(__cplusplus)) ) #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include <complex.h> diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 91565d2f2..9ff924e5f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -69,8 +69,8 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) set(DISABLE_COMPLEX 0) set(MANGLE_COMPLEX 3) if (CBLAS_FLAG EQUAL 1) - set(DISABLE_COMPLEX 1) - set(MANGLE_COMPLEX 1) +# set(DISABLE_COMPLEX 1) +# set(MANGLE_COMPLEX 1) endif () GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) @@ -98,43 +98,54 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) -endforeach () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") - GenerateNamedObjects("zger.c" "" "geru" false "" "" false ${float_type}) - GenerateNamedObjects("zger.c" "CONJ" "gerc" false "" "" false ${float_type}) - GenerateNamedObjects("zdot.c" "CONJ" "dotc" false "" "" false ${float_type}) - GenerateNamedObjects("zdot.c" "" "dotu" false "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "" "dotu" ${CBLAS_FLAG} "" "" false ${float_type}) - GenerateNamedObjects("symm.c" "HEMM" "hemm" false "" "" false ${float_type}) - GenerateNamedObjects("syrk.c" "HEMM" "herk" false "" "" false
${float_type}) - GenerateNamedObjects("syr2k.c" "HEMM" "her2k" false "" "" false ${float_type}) + GenerateNamedObjects("symm.c" "HEMM" "hemm" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("syrk.c" "HEMM" "herk" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") - GenerateNamedObjects("zscal.c" "SSCAL" "sscal" false "" "" false "COMPLEX") - GenerateNamedObjects("nrm2.c" "" "scnrm2" false "" "" true "COMPLEX") - GenerateNamedObjects("zrot.c" "" "csrot" false "" "" true "COMPLEX") - GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" false "" "" true "COMPLEX") - GenerateNamedObjects("max.c" "USE_ABS" "scamax" false "" "" true "COMPLEX") - GenerateNamedObjects("asum.c" "" "scasum" false "" "" true "COMPLEX") + GenerateNamedObjects("zscal.c" "SSCAL" "sscal" ${CBLAS_FLAG} "" "" false "COMPLEX") + GenerateNamedObjects("nrm2.c" "" "scnrm2" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("zrot.c" "" "csrot" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") - GenerateNamedObjects("zscal.c" "SSCAL" "dscal" false "" "" false "ZCOMPLEX") - GenerateNamedObjects("nrm2.c" "" "dznrm2" false "" "" true "ZCOMPLEX") - GenerateNamedObjects("zrot.c" "" "zdrot" false "" "" true "ZCOMPLEX") - GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" false "" "" true "ZCOMPLEX") - GenerateNamedObjects("max.c" "USE_ABS" "dzamax" false "" "" true "ZCOMPLEX") - GenerateNamedObjects("asum.c" "" "dzasum" false "" "" true "ZCOMPLEX") + 
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") + GenerateNamedObjects("nrm2.c" "" "dznrm2" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("zrot.c" "" "zdrot" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () +endforeach () + +#Special functions for CBLAS +if (NOT DEFINED NO_CBLAS) + foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + #cblas_dotc_sub cblas_dotu_sub + GenerateNamedObjects("zdot.c" "FORCE_USE_STACK" "dotu_sub" 1 "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "FORCE_USE_STACK;CONJ" "dotc_sub" 1 "" "" false ${float_type}) + endif() + endforeach () +endif() if (NOT DEFINED NO_LAPACK) set(LAPACK_SOURCES diff --git a/interface/zdot.c b/interface/zdot.c index 34dfb731a..d4d0fab92 100644 --- a/interface/zdot.c +++ b/interface/zdot.c @@ -153,16 +153,19 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ OPENBLAS_COMPLEX_FLOAT ret; + OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); #endif PRINT_DEBUG_CNAME; if (n <= 0) { #ifdef FORCE_USE_STACK - *result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); + //*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); + CREAL(*result) = 0.0; + CIMAG(*result) = 0.0; return; #else - return OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); + return zero; #endif } diff --git a/interface/zgemv.c b/interface/zgemv.c index 792f799e5..520136b45 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -148,6 +148,9 @@ void CNAME(enum CBLAS_ORDER order, blasint info, t; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; 
+ double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, diff --git a/interface/zsyr.c b/interface/zsyr.c index 5fe29cefa..09b1de578 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -121,6 +121,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO FLOAT *buffer; int trans, uplo; blasint info; + FLOAT * ALPHA = &alpha; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; #ifdef SMP int nthreads; #endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 2d7b18973..2156e3993 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -347,6 +347,74 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RTC}" "CONJ;ROWM" "omatcopy_k_rtc" false "" "" false ${float_type}) endif() + #imatcopy + if (NOT DEFINED ${float_char}IMATCOPY_CN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CN ../generic/zimatcopy_cn.c) + else () + set(${float_char}IMATCOPY_CN ../generic/imatcopy_cn.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_RN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RN ../generic/zimatcopy_rn.c) + else () + set(${float_char}IMATCOPY_RN ../generic/imatcopy_rn.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_CT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CT ../generic/zimatcopy_ct.c) + else () + set(${float_char}IMATCOPY_CT ../generic/imatcopy_ct.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_RT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RT ../generic/zimatcopy_rt.c) + else () + set(${float_char}IMATCOPY_RT ../generic/imatcopy_rt.c) + endif () + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CN}" "" "imatcopy_k_cn" false "" "" false ${float_type}) +
GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RN}" "ROWM" "imatcopy_k_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CT}" "" "imatcopy_k_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RT}" "ROWM" "imatcopy_k_rt" false "" "" false ${float_type}) + + + if (NOT DEFINED ${float_char}IMATCOPY_CNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CNC ../generic/zimatcopy_cnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_RNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RNC ../generic/zimatcopy_rnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_CTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CTC ../generic/zimatcopy_ctc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_RTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RTC ../generic/zimatcopy_rtc.c) + endif () + endif () + + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CNC}" "CONJ" "imatcopy_k_cnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RNC}" "CONJ;ROWM" "imatcopy_k_rnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CTC}" "CONJ" "imatcopy_k_ctc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RTC}" "CONJ;ROWM" "imatcopy_k_rtc" false "" "" false ${float_type}) + endif() + + #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () From 1ce054fcb3757e952423a09025c2f2a26023f8a5 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 22 Oct 2015 11:07:35 -0500 Subject: [PATCH 244/257] 
Refs #669. Fixed the build bug with gcc on Mac OS X. --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index f75a47d65..fca516145 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -139,8 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +#if defined(OS_DARWIN) && defined(C_GCC) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#endif #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; From 5a291606adaf425f34dc7223a7775b93518c08cf Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 24 Oct 2015 01:16:34 +0800 Subject: [PATCH 245/257] Refs #671. the return of i?max cannot larger than N. --- interface/imax.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/interface/imax.c b/interface/imax.c index 55ffa7c6e..4378f1e22 100644 --- a/interface/imax.c +++ b/interface/imax.c @@ -136,6 +136,8 @@ blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ ret = (blasint)MAX_K(n, x, incx); + if(ret > n) ret=n; + FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; @@ -159,6 +161,8 @@ CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ ret = MAX_K(n, x, incx); + if (ret > n) ret=n; + if (ret) ret --; FUNCTION_PROFILE_END(COMPSIZE, n, 0); From b809f99ceeeed355dddf61751278a7ab5a74a5f4 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 23:42:21 +0800 Subject: [PATCH 246/257] Add CBLAS test for CMAKE. 
--- CMakeLists.txt | 4 ++++ ctest/CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10df13a3..3b436dc13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,11 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open enable_language(ASM) enable_language(C) +if(MSVC) +set(OpenBLAS_LIBNAME libopenblas) +else() set(OpenBLAS_LIBNAME openblas) +endif() ####### if(MSVC) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index d3e15870b..dbe785bcb 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -1,6 +1,46 @@ include_directories(${CMAKE_SOURCE_DIR}) +enable_language(Fortran) -#foreach(test_bin ${OpenBLAS_Tests}) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") -#endforeach() \ No newline at end of file +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh +"$1 < $2\n" +) + +foreach(float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char_upper) + string(TOLOWER ${float_char_upper} float_char) + #level1 + add_executable(x${float_char}cblat1 + c_${float_char}blat1.f + c_${float_char}blas1.c) + target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat1" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + + #level2 + add_executable(x${float_char}cblat2 + c_${float_char}blat2.f + c_${float_char}blas2.c + c_${float_char}2chke.c + auxiliary.c + c_xerbla.c + constant.c) + target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat2" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + + #level3 + add_executable(x${float_char}cblat3 + c_${float_char}blat3.f + c_${float_char}blas3.c + c_${float_char}3chke.c + auxiliary.c + c_xerbla.c + constant.c) + 
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat3" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + +endforeach() From 309875de3cb752b1a83e6086f54e305ce5d63327 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Oct 2015 02:54:53 +0800 Subject: [PATCH 247/257] Fix cmake bug on x86 32-bit. e.g. Build 32-bit on 64-bit Linux. cmake -DBINARY=32 --- cmake/system.cmake | 6 ++++++ kernel/CMakeLists.txt | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 8ec738a10..71bf5c2cc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -323,7 +323,13 @@ if (NOT DEFINED COMMON_OPT) set(COMMON_OPT "-O2") endif () +#For x86 32-bit +if (DEFINED BINARY AND BINARY EQUAL 32) + set(COMMON_OPT "${COMMON_OPT} -m32") +endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") # TODO: not sure what PFLAGS is -hpa set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 2156e3993..43837a0f3 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -165,9 +165,9 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "UPPER;LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) #hemm From 53b6023a6cd458eecf22d03361881fda57d85f06 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 14:52:13 -0500 Subject: [PATCH 248/257] Fix cmake bug on MSVC 32-bit. --- cmake/system.cmake | 4 ++++ kernel/CMakeLists.txt | 4 ++++ kernel/x86/cpuid_win.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 kernel/x86/cpuid_win.c diff --git a/cmake/system.cmake b/cmake/system.cmake index 71bf5c2cc..134e9c12d 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -325,11 +325,15 @@ endif () #For x86 32-bit if (DEFINED BINARY AND BINARY EQUAL 32) +if (NOT MSVC) set(COMMON_OPT "${COMMON_OPT} -m32") endif() +endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +endif() # TODO: not sure what PFLAGS is -hpa set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 43837a0f3..8a3b021cc 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -22,7 +22,11 @@ ParseMakefileVars("${KERNELDIR}/KERNEL") ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") if (${ARCH} STREQUAL "x86") +if (NOT MSVC) GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) +else() + GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) +endif() endif () # don't use float type name mangling here diff --git a/kernel/x86/cpuid_win.c b/kernel/x86/cpuid_win.c new file mode 
100644 index 000000000..a1b00016b --- /dev/null +++ b/kernel/x86/cpuid_win.c @@ -0,0 +1,41 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#if defined(_MSC_VER) && !defined(__clang__) + +#include <intrin.h> + +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) +{ + int cpuInfo[4] = {-1}; + __cpuid(cpuInfo, op); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} +#endif From e6d754fddcf44bf471f6abe9cdf2596db5fbe540 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 15:08:17 -0500 Subject: [PATCH 249/257] Use AppVeyor Windows CI. --- appveyor.yml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..9bb712a95 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,42 @@ +version: 0.2.14.{build} + +#environment: + +platform: + - x86 + - x64 +configuration: Release + +clone_folder: c:\projects\OpenBLAS + +init: + - git config --global core.autocrlf input + +build: + project: OpenBLAS.sln + +clone_depth: 5 + +#branches to build +branches: + only: + - master + - develop + - cmake + +skip_tags: true + +matrix: + fast_finish: true + +skip_commits: +# Add [av skip] to commit messages + message: /\[av skip\]/ + +before_build: + - echo Running cmake... + - cd c:\projects\OpenBLAS + - cmake . + +test_script: + From 69363622a896f1e8aef5454351b505509994fe7f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Oct 2015 05:10:40 +0800 Subject: [PATCH 250/257] Fix DYNAMIC_ARCH=1 bug.
--- common_param.h | 50 ++++++++++++------------ kernel/setparam-ref.c | 7 ++++ kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 4 +- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/common_param.h b/common_param.h index ab40ddeef..36d6149ea 100644 --- a/common_param.h +++ b/common_param.h @@ -830,56 +830,56 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, 
float, float*, BLASLONG); - int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + 
int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 0eeac2e1f..a4d1486fc 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -550,6 +550,13 @@ gotoblas_t TABLE_NAME = { zomatcopy_k_cnTS, 
zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, + simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, + dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, + cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS, + cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, + zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, + zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, + sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS }; diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 504c784ac..ac8c97d03 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -7,7 +7,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) { - BLASLONG I = 0; + BLASLONG i = 0; BLASLONG temp1 = n * 8; __asm__ __volatile__ @@ -110,7 +110,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA : : - "a" (I), // 0 + "a" (i), // 0 "r" (temp1), // 1 "S" (a), // 2 "D" (b), // 3 From 0b2ad98e48b36d4110e37112199c457b5023b511 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Oct 2015 05:11:07 +0800 Subject: [PATCH 251/257] Only test x64 Windows CI. 
--- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 9bb712a95..3a0db79aa 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,8 +3,8 @@ version: 0.2.14.{build} #environment: platform: - - x86 - x64 + configuration: Release clone_folder: c:\projects\OpenBLAS From 1ac8c32f1d3a163f7cf2df45b7441e58dd569301 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 18:08:54 -0500 Subject: [PATCH 252/257] [ci skip] Build Visual Studio 12 Win64 on Appveyor --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 3a0db79aa..74e1b00a9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -36,7 +36,7 @@ skip_commits: before_build: - echo Running cmake... - cd c:\projects\OpenBLAS - - cmake . + - cmake -G "Visual Studio 12 Win64" . test_script: From 79d4a62e10f1f5aa9e38a6fc0a4a695023772c78 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 18:14:41 -0500 Subject: [PATCH 253/257] Add AppVeyor badge. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 16f874078..0ec86d362 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,9 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. 
From 70642fe4ed4ffd74d305cd5c76cd6425dba4bbd1 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Oct 2015 19:02:51 -0500 Subject: [PATCH 254/257] Refs #668. Raise the signal when pthread_create fails. Thank James K. Lowden for the patch. --- appveyor.yml | 2 +- driver/others/blas_server.c | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 74e1b00a9..4daf1bd3d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,4 +39,4 @@ before_build: - cmake -G "Visual Studio 12 Win64" . test_script: - + - echo Build OK! diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 1fd848c6b..b570bcd5a 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,9 +70,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) #include +#include #include +#include #endif #ifndef likely @@ -265,7 +267,7 @@ int get_node(void); static int increased_threads = 0; -static int blas_thread_server(void *arg){ +static void* blas_thread_server(void *arg){ /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; @@ -458,7 +460,7 @@ static int blas_thread_server(void *arg){ //pthread_exit(NULL); - return 0; + return NULL; } #ifdef MONITOR @@ -565,14 +567,23 @@ int blas_thread_init(void){ #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #else ret=pthread_create(&blas_threads[i], NULL, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #endif if(ret!=0){ - fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. 
Error code:%d\n",ret); - exit(1); + struct rlimit rlim; + const char *msg = strerror(ret); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); + if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { + fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " + "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max)); + } + if(0 != raise(SIGINT)) { + fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n"); + exit(EXIT_FAILURE); + } } } @@ -832,10 +843,10 @@ void goto_set_num_threads(int num_threads) { #ifdef NEED_STACKATTR pthread_create(&blas_threads[i], &attr, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #else pthread_create(&blas_threads[i], NULL, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #endif } From 6040858b22f6c9a95cd22514f386f0f1f43c16f0 Mon Sep 17 00:00:00 2001 From: j-bo Date: Tue, 27 Oct 2015 13:55:24 +0100 Subject: [PATCH 255/257] Fix #673 Add lacking headers declarations when compiling for Android ARM7 --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b570bcd5a..e1c644a80 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #include #include #include From 63c56d3da93f80f704144750ba7bbf5887bbb5a4 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Oct 2015 10:47:55 -0500 Subject: [PATCH 256/257] Only include complex.h since Android 5.0 --- common.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/common.h b/common.h index d8eadb421..7b81c6fb6 100644 --- a/common.h +++ b/common.h @@ -98,6 +98,10 @@ extern "C" { #ifdef OS_ANDROID #define NO_SYSV_IPC +//Android NDK only supports complex.h since Android 5.0 +#if __ANDROID_API__ < 21 +#define FORCE_OPENBLAS_COMPLEX_STRUCT +#endif #endif #ifdef OS_WINDOWS @@ -501,12 +505,12 @@ static void __inline blas_lock(volatile BLASULONG *address){ /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus)) ) +#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) #define OPENBLAS_COMPLEX_C99 -#ifndef __cplusplus - #include -#endif + #ifndef __cplusplus + #include + #endif typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble _Complex openblas_complex_xdouble; From 8447498b504a3a903507715b3144e8b2a05ec0a0 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Oct 2015 15:44:35 -0500 Subject: [PATCH 257/257] Update doc for OpenBLAS 0.2.15 version. 
[CI skipped] --- Changelog.txt | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ Makefile.rule | 2 +- appveyor.yml | 2 +- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 6941a9f96..422b8b519 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,57 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.15 +27-Oct-2015 +common: + * Support cmake on x86/x86-64. Natively compiling on MS Visual Studio. + (experimental. Thank Hank Anderson for the initial cmake porting work.) + + On Linux and Mac OSX, OpenBLAS cmake supports assembly kernels. + e.g. cmake . + make + make test (Optional) + + On Windows MS Visual Studio, OpenBLAS cmake only supports C kernels. + (OpenBLAS uses AT&T style assembly, which is not supported by MSVC.) + e.g. cmake -G "Visual Studio 12 Win64" . + Open OpenBLAS.sln and build. + + * Enable MAX_STACK_ALLOC flags by default. + Improve ger and gemv for small matrices. + * Improve gemv parallel with small m and large n case. + * Improve ?imatcopy when lda==ldb (#633. Thanks, Martin Koehler) + * Add vecLib benchmarks (#565. Thanks, Andreas Noack.) + * Fix LAPACK lantr for row major matrices (#634. Thanks, Dan Kortschak) + * Fix LAPACKE lansy (#640. Thanks, Dan Kortschak) + * Import bug fixes for LAPACKE s/dormlq, c/zunmlq + * Raise the signal when pthread_create fails (#668. Thanks, James K. Lowden) + * Remove g77 from compiler list. + * Enable AppVeyor Windows CI. + +x86/x86-64: + * Support pure C generic kernels for x86/x86-64. + * Support Intel Broadwell and Skylake by Haswell kernels. + * Support AMD Excavator by Steamroller kernels. + * Optimize s/d/c/zdot for Intel SandyBridge and Haswell. + * Optimize s/d/c/zdot for AMD Piledriver and Steamroller. + * Optimize s/d/c/zaxpy for Intel SandyBridge and Haswell. + * Optimize s/d/c/zaxpy for AMD Piledriver and Steamroller. + * Optimize d/c/zscal for Intel Haswell, dscal for Intel SandyBridge.
+ * Optimize d/c/zscal for AMD Bulldozer, Piledriver and Steamroller. + * Optimize s/dger for Intel SandyBridge. + * Optimize s/dsymv for Intel SandyBridge. + * Optimize ssymv for Intel Haswell. + * Optimize dgemv for Intel Nehalem and Haswell. + * Optimize dtrmm for Intel Haswell. + +ARM: + * Support Android NDK armeabi-v7a-hard ABI (-mfloat-abi=hard) + e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7 + * Fix lock, rpcc bugs (#616, #617. Thanks, Grazvydas Ignotas) +POWER: + * Support ppc64le platform (ELF ABI v2. #612. Thanks, Matthew Brandyberry.) + * Support POWER7/8 by POWER6 kernels. (#612. Thanks, Fábio Perez.) + ==================================================================== Version 0.2.14 24-Mar-2015 diff --git a/Makefile.rule b/Makefile.rule index 22f222e3f..459f79c26 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.14 +VERSION = 0.2.15 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/appveyor.yml b/appveyor.yml index 4daf1bd3d..394e48854 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.14.{build} +version: 0.2.15.{build} #environment: