Merge develop in preparation of 0.3.6 release (tags/v0.3.6)
@@ -149,7 +149,7 @@ matrix: | |||||
- &test-macos | - &test-macos | ||||
os: osx | os: osx | ||||
osx_image: xcode8 | |||||
osx_image: xcode10.1 | |||||
before_script: | before_script: | ||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | ||||
- brew update | - brew update | ||||
@@ -160,6 +160,7 @@ matrix: | |||||
- BTYPE="BINARY=64 INTERFACE64=1" | - BTYPE="BINARY=64 INTERFACE64=1" | ||||
- <<: *test-macos | - <<: *test-macos | ||||
osx_image: xcode8.3 | |||||
env: | env: | ||||
- BTYPE="BINARY=32" | - BTYPE="BINARY=32" | ||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
set(OpenBLAS_PATCH_VERSION 5) | |||||
set(OpenBLAS_PATCH_VERSION 6) | |||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
# Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
@@ -42,6 +42,19 @@ endif() | |||||
####### | ####### | ||||
if(MSVC AND MSVC_STATIC_CRT) | |||||
set(CompilerFlags | |||||
CMAKE_CXX_FLAGS | |||||
CMAKE_CXX_FLAGS_DEBUG | |||||
CMAKE_CXX_FLAGS_RELEASE | |||||
CMAKE_C_FLAGS | |||||
CMAKE_C_FLAGS_DEBUG | |||||
CMAKE_C_FLAGS_RELEASE | |||||
) | |||||
foreach(CompilerFlag ${CompilerFlags}) | |||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||||
endforeach() | |||||
endif() | |||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | ||||
@@ -62,10 +75,10 @@ endif () | |||||
set(SUBDIRS ${BLASDIRS}) | set(SUBDIRS ${BLASDIRS}) | ||||
if (NOT NO_LAPACK) | if (NOT NO_LAPACK) | ||||
list(APPEND SUBDIRS lapack) | |||||
if(BUILD_RELAPACK) | if(BUILD_RELAPACK) | ||||
list(APPEND SUBDIRS relapack/src) | list(APPEND SUBDIRS relapack/src) | ||||
endif() | endif() | ||||
list(APPEND SUBDIRS lapack) | |||||
endif () | endif () | ||||
# set which float types we want to build for | # set which float types we want to build for | ||||
@@ -134,7 +147,7 @@ endif () | |||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release | # Only generate .def for dll on MSVC and always produce pdb files for debug and release | ||||
if(MSVC) | if(MSVC) | ||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | ||||
endif() | endif() | ||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | ||||
@@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH}) | |||||
endforeach() | endforeach() | ||||
endif () | endif () | ||||
# Only build shared libs for MSVC | |||||
if (MSVC) | |||||
set(BUILD_SHARED_LIBS ON) | |||||
endif() | |||||
# add objects to the openblas lib | # add objects to the openblas lib | ||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | ||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||||
# Android needs to explicitly link against libm | # Android needs to explicitly link against libm | ||||
if(ANDROID) | if(ANDROID) | ||||
@@ -166,7 +173,7 @@ endif() | |||||
# Handle MSVC exports | # Handle MSVC exports | ||||
if(MSVC AND BUILD_SHARED_LIBS) | if(MSVC AND BUILD_SHARED_LIBS) | ||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | ||||
else() | else() | ||||
# Creates verbose .def file (51KB vs 18KB) | # Creates verbose .def file (51KB vs 18KB) | ||||
@@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||||
SOVERSION ${OpenBLAS_MAJOR_VERSION} | SOVERSION ${OpenBLAS_MAJOR_VERSION} | ||||
) | ) | ||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||||
if (NOT MSVC) | |||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||||
else() | |||||
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") | |||||
endif() | |||||
endif() | |||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | ||||
if (NOT DEFINED ARCH) | if (NOT DEFINED ARCH) | ||||
set(ARCH_IN "x86_64") | set(ARCH_IN "x86_64") | ||||
@@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
if(NOT NOFORTRAN) | if(NOT NOFORTRAN) | ||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | ||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) | |||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) | |||||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") | file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") | ||||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") | file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") | ||||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") | file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") | ||||
@@ -327,10 +342,11 @@ endif() | |||||
if(NOT NO_CBLAS) | if(NOT NO_CBLAS) | ||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | ||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | ||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | ||||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") | |||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) | |||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
endif() | endif() | ||||
if(NOT NO_LAPACKE) | if(NOT NO_LAPACKE) | ||||
@@ -1,4 +1,82 @@ | |||||
OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
==================================================================== | |||||
Version 0.3.6 | |||||
29-Apr-2019 | |||||
common: | |||||
* the build tools now check that a given cpu TARGET is actually valid | |||||
* the build-time check of system features (c_check) has been made | |||||
less dependent on particular perl features (this should mainly | |||||
benefit building on Windows) | |||||
* several problems with the ReLAPACK integration were fixed, | |||||
including INTERFACE64 support and building a shared library | |||||
* building with CMAKE on BSD systems was improved | |||||
* a non-absolute SUM function was added based on the | |||||
existing optimized code for ASUM | |||||
* CBLAS interfaces to the IxMIN and IxMAX functions were added | |||||
* a name clash between LAPACKE and BOOST headers was resolved | |||||
* a failure of CMAKE builds with OpenMP to include the appropriate getrf_parallel | |||||
kernels was fixed | |||||
* a crash on thread (key) deletion with the USE_TLS=1 memory management | |||||
option was fixed | |||||
* restored several earlier fixes, in particular for OpenMP performance, | |||||
building on BSD, and calling fork on CYGWIN, which had inadvertently | |||||
been dropped in the 0.3.3 rewrite of the memory management code. | |||||
x86_64: | |||||
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems | |||||
* building with old versions of MSVC was fixed | |||||
* it is now possible to build a static library on Windows with CMAKE | |||||
* accessing environment variables on CYGWIN at run time was fixed | |||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||||
* Intel "Denverton" Atom and Hygon "Dhyana" Zen CPUs are now autodetected | |||||
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported | |||||
with CMAKE as well | |||||
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported | |||||
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed | |||||
* assembly bugs involving undeclared modification of input operands were fixed | |||||
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, | |||||
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause | |||||
test failures or segfaults when compiled with recent versions of gcc from 8 onward. | |||||
* a similar bug was fixed in the blas_quickdivide code used to split workloads | |||||
in most functions | |||||
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX | |||||
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating | |||||
environment does not support AVX512 | |||||
* improved GEMM performance on ZEN targets | |||||
x86: | |||||
* build failures caused by the recently added checks for AVX512 were fixed | |||||
* an inline assembly bug involving undeclared modification of an input argument was | |||||
fixed in the blas_quickdivide code used to split workloads in most functions | |||||
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX | |||||
MIPS32: | |||||
* a bug in the IMIN implementation made it return the result of IMAX | |||||
POWER: | |||||
* single precision BLAS1/2 functions have received optimized POWER8 kernels | |||||
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel | |||||
* building on PPC970 systems under OSX Leopard or Tiger is now supported | |||||
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed | |||||
* building a shared library on AIX is now supported for POWER6 | |||||
* DYNAMIC_ARCH support has been added for POWER6 and newer | |||||
ARMv7: | |||||
* corrected xDOT behaviour with zero INC_X or INC_Y | |||||
* a bug in the IMIN implementation made it return the result of IMAX | |||||
ARMv8: | |||||
* added support for HiSilicon TSV110 cpus | |||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||||
* cross-compilation with CMAKE now works again | |||||
* a bug in the IMIN implementation made it return the result of IMAX | |||||
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 | |||||
IBM Z: | |||||
* optimized microkernels for single precision BLAS1/2 functions have been added | |||||
for both Z13 and Z14 | |||||
==================================================================== | ==================================================================== | ||||
Version 0.3.5 | Version 0.3.5 | ||||
31-Dec-2018 | 31-Dec-2018 | ||||
@@ -96,7 +96,7 @@ endif | |||||
@echo | @echo | ||||
shared : | shared : | ||||
ifndef NO_SHARED | |||||
ifneq ($(NO_SHARED), 1) | |||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | ||||
@$(MAKE) -C exports so | @$(MAKE) -C exports so | ||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so | @ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
@@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) | |||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
endif | endif | ||||
ifeq ($(CORE), TSV110) | |||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||||
endif |
@@ -58,14 +58,14 @@ ifndef NO_LAPACKE | |||||
endif | endif | ||||
#for install static library | #for install static library | ||||
ifndef NO_STATIC | |||||
ifneq ($(NO_STATIC),1) | |||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ||||
endif | endif | ||||
#for install shared library | #for install shared library | ||||
ifndef NO_SHARED | |||||
ifneq ($(NO_SHARED),1) | |||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | ||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
@@ -106,14 +106,14 @@ ifndef NO_LAPACKE | |||||
endif | endif | ||||
#for install static library | #for install static library | ||||
ifndef NO_STATIC | |||||
ifneq ($(NO_STATIC),1) | |||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ||||
endif | endif | ||||
#for install shared library | #for install shared library | ||||
ifndef NO_SHARED | |||||
ifneq ($(NO_SHARED),1) | |||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
@@ -138,7 +138,7 @@ endif | |||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | ||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | ||||
ifndef NO_SHARED | |||||
ifneq ($(NO_SHARED),1) | |||||
#ifeq logical or | #ifeq logical or | ||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | ||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | ||||
@@ -9,7 +9,15 @@ else | |||||
USE_OPENMP = 1 | USE_OPENMP = 1 | ||||
endif | endif | ||||
ifeq ($(CORE), POWER9) | |||||
ifeq ($(USE_OPENMP), 1) | |||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
else | |||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||||
endif | |||||
endif | |||||
ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
@@ -3,7 +3,7 @@ | |||||
# | # | ||||
# This library's version | # This library's version | ||||
VERSION = 0.3.5 | |||||
VERSION = 0.3.6 | |||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
@@ -48,6 +48,8 @@ VERSION = 0.3.5 | |||||
# HOSTCC = gcc | # HOSTCC = gcc | ||||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | ||||
# Please note that AVX is not available on 32-bit. | |||||
# Setting BINARY=32 disables AVX/AVX2/AVX-512. | |||||
# BINARY=64 | # BINARY=64 | ||||
# About threaded BLAS. It will be automatically detected if you don't | # About threaded BLAS. It will be automatically detected if you don't | ||||
@@ -57,7 +59,7 @@ VERSION = 0.3.5 | |||||
# USE_THREAD = 0 | # USE_THREAD = 0 | ||||
# If you're going to use this library with OpenMP, please comment it in. | # If you're going to use this library with OpenMP, please comment it in. | ||||
# This flag is always set for POWER8. Don't modify the flag | |||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | |||||
# USE_OPENMP = 1 | # USE_OPENMP = 1 | ||||
# The OpenMP scheduler to use - by default this is "static" and you | # The OpenMP scheduler to use - by default this is "static" and you | ||||
@@ -68,36 +70,45 @@ VERSION = 0.3.5 | |||||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE | # allow you to select the scheduler from the environment variable OMP_SCHEDULE | ||||
# CCOMMON_OPT += -DOMP_SCHED=dynamic | # CCOMMON_OPT += -DOMP_SCHED=dynamic | ||||
# You can define maximum number of threads. Basically it should be | |||||
# less than actual number of cores. If you don't specify one, it's | |||||
# automatically detected by the the script. | |||||
# You can define the maximum number of threads. Basically it should be less | |||||
# than or equal to the number of CPU threads. If you don't specify one, it's | |||||
# automatically detected by the build system. | |||||
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to | |||||
# restrict NUM_THREADS to the number of physical cores. By default, the automatic | |||||
# detection includes logical CPUs, thus allowing the use of SMT. | |||||
# Users may opt at runtime to use less than NUM_THREADS threads. | |||||
# | |||||
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS | |||||
# value (e.g. 32-256) if you expect your users to use that many threads. Due to the way | |||||
# some internal structures are allocated, using a large NUM_THREADS value has a RAM | |||||
# footprint penalty, even if users reduce the actual number of threads at runtime. | |||||
# NUM_THREADS = 24 | # NUM_THREADS = 24 | ||||
# If you have enabled USE_OPENMP and your application would call | # If you have enabled USE_OPENMP and your application would call | ||||
# OpenBLAS's calculation API from multi threads, please comment it in. | |||||
# This flag defines how many instances of OpenBLAS's calculation API can | |||||
# actually run in parallel. If more threads call OpenBLAS's calculation API, | |||||
# OpenBLAS's calculation API from multiple threads, please comment this in. | |||||
# This flag defines how many instances of OpenBLAS's calculation API can actually | |||||
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, | |||||
# they need to wait for the preceding API calls to finish or risk data corruption. | # they need to wait for the preceding API calls to finish or risk data corruption. | ||||
# NUM_PARALLEL = 2 | # NUM_PARALLEL = 2 | ||||
# if you don't need to install the static library, please comment it in. | |||||
# If you don't need to install the static library, please comment this in. | |||||
# NO_STATIC = 1 | # NO_STATIC = 1 | ||||
# if you don't need generate the shared library, please comment it in. | |||||
# If you don't need to generate the shared library, please comment this in. | |||||
# NO_SHARED = 1 | # NO_SHARED = 1 | ||||
# If you don't need CBLAS interface, please comment it in. | |||||
# If you don't need the CBLAS interface, please comment this in. | |||||
# NO_CBLAS = 1 | # NO_CBLAS = 1 | ||||
# If you only want CBLAS interface without installing Fortran compiler, | |||||
# please comment it in. | |||||
# If you only want the CBLAS interface without installing a Fortran compiler, | |||||
# please comment this in. | |||||
# ONLY_CBLAS = 1 | # ONLY_CBLAS = 1 | ||||
# If you don't need LAPACK, please comment it in. | |||||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||||
# If you don't need LAPACK, please comment this in. | |||||
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. | |||||
# NO_LAPACK = 1 | # NO_LAPACK = 1 | ||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. | |||||
# NO_LAPACKE = 1 | # NO_LAPACKE = 1 | ||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0 | # Build LAPACK Deprecated functions since LAPACK 3.6.0 | ||||
@@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||||
# Build RecursiveLAPACK on top of LAPACK | # Build RecursiveLAPACK on top of LAPACK | ||||
# BUILD_RELAPACK = 1 | # BUILD_RELAPACK = 1 | ||||
# If you want to use legacy threaded Level 3 implementation. | |||||
# If you want to use the legacy threaded Level 3 implementation. | |||||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
# If you want to use the new, still somewhat experimental code that uses | # If you want to use the new, still somewhat experimental code that uses | ||||
@@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1 | |||||
# USE_TLS = 1 | # USE_TLS = 1 | ||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran | # If you want to drive whole 64bit region by BLAS. Not all Fortran | ||||
# compiler supports this. It's safe to keep comment it out if you | |||||
# are not sure(equivalent to "-i8" option). | |||||
# compilers support this. It's safe to keep this commented out if you | |||||
# are not sure. (This is equivalent to the "-i8" ifort option). | |||||
# INTERFACE64 = 1 | # INTERFACE64 = 1 | ||||
# Unfortunately most of kernel won't give us high quality buffer. | # Unfortunately most of kernel won't give us high quality buffer. | ||||
@@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1 | |||||
# but it will consume time. If you don't like it, you can disable one. | # but it will consume time. If you don't like it, you can disable one. | ||||
NO_WARMUP = 1 | NO_WARMUP = 1 | ||||
# If you want to disable CPU/Memory affinity on Linux. | |||||
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. | |||||
# This feature is only implemented on Linux, and is always disabled on other platforms. | |||||
# Enabling affinity handling may improve performance, especially on NUMA systems, but | |||||
# it may conflict with certain applications that also try to manage affinity. | |||||
# This conflict can result in threads of the application calling OpenBLAS ending up locked | |||||
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. | |||||
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing | |||||
# else modifies affinity settings. | |||||
# Note: enabling affinity has been known to cause problems with NumPy and R | |||||
NO_AFFINITY = 1 | NO_AFFINITY = 1 | ||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||||
# BIGNUMA = 1 | # BIGNUMA = 1 | ||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | ||||
@@ -180,7 +199,7 @@ NO_AFFINITY = 1 | |||||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia). | # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | ||||
# GEMM_MULTITHREAD_THRESHOLD = 4 | # GEMM_MULTITHREAD_THRESHOLD = 4 | ||||
# If you need santy check by comparing reference BLAS. It'll be very | |||||
# If you need a sanity check by comparing results to reference BLAS. It'll be very | |||||
# slow (Not implemented yet). | # slow (Not implemented yet). | ||||
# SANITY_CHECK = 1 | # SANITY_CHECK = 1 | ||||
@@ -65,6 +65,7 @@ endif | |||||
ifdef TARGET | ifdef TARGET | ||||
GETARCH_FLAGS := -DFORCE_$(TARGET) | GETARCH_FLAGS := -DFORCE_$(TARGET) | ||||
GETARCH_FLAGS += -DUSER_TARGET | |||||
endif | endif | ||||
# Force fallbacks for 32bit | # Force fallbacks for 32bit | ||||
@@ -94,6 +95,9 @@ endif | |||||
ifeq ($(TARGET), ZEN) | ifeq ($(TARGET), ZEN) | ||||
GETARCH_FLAGS := -DFORCE_BARCELONA | GETARCH_FLAGS := -DFORCE_BARCELONA | ||||
endif | endif | ||||
ifeq ($(TARGET), ARMV8) | |||||
GETARCH_FLAGS := -DFORCE_ARMV7 | |||||
endif | |||||
endif | endif | ||||
@@ -151,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX | |||||
endif | endif | ||||
ifeq ($(BINARY), 32) | ifeq ($(BINARY), 32) | ||||
GETARCH_FLAGS += -DNO_AVX | |||||
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 | |||||
NO_AVX512 = 1 | |||||
endif | endif | ||||
ifeq ($(NO_AVX2), 1) | ifeq ($(NO_AVX2), 1) | ||||
@@ -523,6 +528,12 @@ DYNAMIC_CORE += THUNDERX | |||||
DYNAMIC_CORE += THUNDERX2T99 | DYNAMIC_CORE += THUNDERX2T99 | ||||
endif | endif | ||||
ifeq ($(ARCH), power) | |||||
DYNAMIC_CORE = POWER6 | |||||
DYNAMIC_CORE += POWER8 | |||||
DYNAMIC_CORE += POWER9 | |||||
endif | |||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | ||||
ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
override DYNAMIC_ARCH= | override DYNAMIC_ARCH= | ||||
@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector | |||||
FCOMMON_OPT += -march=z13 -mzvector | FCOMMON_OPT += -march=z13 -mzvector | ||||
endif | endif | ||||
ifeq ($(CORE), Z14) | |||||
CCOMMON_OPT += -march=z14 -mzvector | |||||
FCOMMON_OPT += -march=z14 -mzvector | |||||
endif |
@@ -48,6 +48,7 @@ POWER5 | |||||
POWER6 | POWER6 | ||||
POWER7 | POWER7 | ||||
POWER8 | POWER8 | ||||
POWER9 | |||||
PPCG4 | PPCG4 | ||||
PPC970 | PPC970 | ||||
PPC970MP | PPC970MP | ||||
@@ -90,7 +91,9 @@ CORTEXA73 | |||||
FALKOR | FALKOR | ||||
THUNDERX | THUNDERX | ||||
THUNDERX2T99 | THUNDERX2T99 | ||||
TSV110 | |||||
9.System Z: | 9.System Z: | ||||
ZARCH_GENERIC | ZARCH_GENERIC | ||||
Z13 | Z13 | ||||
Z14 |
@@ -53,9 +53,9 @@ before_build: | |||||
- ps: if (-Not (Test-Path .\build)) { mkdir build } | - ps: if (-Not (Test-Path .\build)) { mkdir build } | ||||
- cd build | - cd build | ||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | ||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. | |||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | ||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. | |||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||||
build_script: | build_script: | ||||
- cmake --build . | - cmake --build . | ||||
@@ -2,6 +2,8 @@ | |||||
argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||||
nfrom <- 128 | nfrom <- 128 | ||||
nto <- 2048 | nto <- 2048 | ||||
nstep <- 128 | nstep <- 128 | ||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||||
loops <- as.numeric(argv[z]) | loops <- as.numeric(argv[z]) | ||||
} | } | ||||
} | } | ||||
} | } | ||||
p <- Sys.getenv("OPENBLAS_LOOPS") | p <- Sys.getenv("OPENBLAS_LOOPS") | ||||
@@ -27,29 +28,21 @@ if (p != "") { | |||||
loops <- as.numeric(p) | loops <- as.numeric(p) | ||||
} | } | ||||
cat(sprintf( | |||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
nfrom, | |||||
nto, | |||||
nstep, | |||||
loops | |||||
)) | |||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||||
cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
n <- nfrom | n <- nfrom | ||||
while (n <= nto) { | while (n <= nto) { | ||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
A <- matrix(rnorm(n * n), nrow = n) | |||||
ev <- 0 | ev <- 0 | ||||
z <- system.time(for (l in 1:loops) { | z <- system.time(for (l in 1:loops) { | ||||
ev <- eigen(A) | ev <- eigen(A) | ||||
}) | }) | ||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) | |||||
st <- sprintf("%.0fx%.0f :", n, n) | st <- sprintf("%.0fx%.0f :", n, n) | ||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | ||||
n <- n + nstep | n <- n + nstep | ||||
} | } |
@@ -2,6 +2,8 @@ | |||||
argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||||
nfrom <- 128 | nfrom <- 128 | ||||
nto <- 2048 | nto <- 2048 | ||||
nstep <- 128 | nstep <- 128 | ||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||||
loops <- as.numeric(argv[z]) | loops <- as.numeric(argv[z]) | ||||
} | } | ||||
} | } | ||||
} | } | ||||
p <- Sys.getenv("OPENBLAS_LOOPS") | p <- Sys.getenv("OPENBLAS_LOOPS") | ||||
@@ -27,26 +28,13 @@ if (p != "") { | |||||
loops <- as.numeric(p) | loops <- as.numeric(p) | ||||
} | } | ||||
cat(sprintf( | |||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
nfrom, | |||||
nto, | |||||
nstep, | |||||
loops | |||||
)) | |||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||||
cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
n <- nfrom | n <- nfrom | ||||
while (n <= nto) { | while (n <= nto) { | ||||
A <- matrix(runif(n * n), | |||||
ncol = n, | |||||
nrow = n, | |||||
byrow = TRUE) | |||||
B <- matrix(runif(n * n), | |||||
ncol = n, | |||||
nrow = n, | |||||
byrow = TRUE) | |||||
A <- matrix(runif(n * n), nrow = n) | |||||
B <- matrix(runif(n * n), nrow = n) | |||||
C <- 1 | C <- 1 | ||||
z <- system.time(for (l in 1:loops) { | z <- system.time(for (l in 1:loops) { | ||||
@@ -54,11 +42,10 @@ while (n <= nto) { | |||||
l <- l + 1 | l <- l + 1 | ||||
}) | }) | ||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) | |||||
st <- sprintf("%.0fx%.0f :", n, n) | st <- sprintf("%.0fx%.0f :", n, n) | ||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | ||||
n <- n + nstep | n <- n + nstep | ||||
} | } |
@@ -2,6 +2,8 @@ | |||||
argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||||
nfrom <- 128 | nfrom <- 128 | ||||
nto <- 2048 | nto <- 2048 | ||||
nstep <- 128 | nstep <- 128 | ||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||||
loops <- as.numeric(argv[z]) | loops <- as.numeric(argv[z]) | ||||
} | } | ||||
} | } | ||||
} | } | ||||
p <- Sys.getenv("OPENBLAS_LOOPS") | p <- Sys.getenv("OPENBLAS_LOOPS") | ||||
@@ -27,31 +28,22 @@ if (p != "") { | |||||
loops <- as.numeric(p) | loops <- as.numeric(p) | ||||
} | } | ||||
cat(sprintf( | |||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
nfrom, | |||||
nto, | |||||
nstep, | |||||
loops | |||||
)) | |||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||||
cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
n <- nfrom | n <- nfrom | ||||
while (n <= nto) { | while (n <= nto) { | ||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
A <- matrix(rnorm(n * n), nrow = n) | |||||
B <- matrix(rnorm(n * n), nrow = n) | |||||
z <- system.time(for (l in 1:loops) { | z <- system.time(for (l in 1:loops) { | ||||
solve(A, B) | solve(A, B) | ||||
}) | }) | ||||
mflops <- | |||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) | |||||
st <- sprintf("%.0fx%.0f :", n, n) | st <- sprintf("%.0fx%.0f :", n, n) | ||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | ||||
n <- n + nstep | n <- n + nstep | ||||
} | } |
@@ -1,7 +1,7 @@ | |||||
#!/usr/bin/perl | #!/usr/bin/perl | ||||
use File::Basename; | |||||
use File::Temp qw(tempfile); | |||||
#use File::Basename; | |||||
# use File::Temp qw(tempfile); | |||||
# Checking cross compile | # Checking cross compile | ||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | ||||
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | ||||
$hostarch = "zarch" if ($hostarch eq "s390x"); | $hostarch = "zarch" if ($hostarch eq "s390x"); | ||||
$tmpf = new File::Temp( UNLINK => 1 ); | |||||
#$tmpf = new File::Temp( UNLINK => 1 ); | |||||
$binary = $ENV{"BINARY"}; | $binary = $ENV{"BINARY"}; | ||||
$makefile = shift(@ARGV); | $makefile = shift(@ARGV); | ||||
@@ -31,12 +31,25 @@ if ($?) { | |||||
$cross_suffix = ""; | $cross_suffix = ""; | ||||
if (dirname($compiler_name) ne ".") { | |||||
$cross_suffix .= dirname($compiler_name) . "/"; | |||||
} | |||||
eval "use File::Basename"; | |||||
if ($@){ | |||||
warn "could not load PERL module File::Basename, emulating its functionality"; | |||||
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); | |||||
if ($dirnam ne ".") { | |||||
$cross_suffix .= $dirnam . "/"; | |||||
} | |||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); | |||||
if ($basnam =~ /([^\s]*-)(.*)/) { | |||||
$cross_suffix .= $1; | |||||
} | |||||
} else { | |||||
if (dirname($compiler_name) ne ".") { | |||||
$cross_suffix .= dirname($compiler_name) . "/"; | |||||
} | |||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||||
$cross_suffix .= $1; | |||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||||
$cross_suffix .= $1; | |||||
} | |||||
} | } | ||||
$compiler = ""; | $compiler = ""; | ||||
@@ -171,20 +184,26 @@ if ($?) { | |||||
$have_msa = 0; | $have_msa = 0; | ||||
if (($architecture eq "mips") || ($architecture eq "mips64")) { | if (($architecture eq "mips") || ($architecture eq "mips64")) { | ||||
$code = '"addvi.b $w0, $w1, 1"'; | |||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||||
print $tmpf "#include <msa.h>\n\n"; | |||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||||
my @cmd = ("$compiler_name $args"); | |||||
system(@cmd) == 0; | |||||
if ($? != 0) { | |||||
$have_msa = 0; | |||||
eval "use File::Temp qw(tempfile)"; | |||||
if ($@){ | |||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||||
} else { | } else { | ||||
$have_msa = 1; | |||||
$tmpf = new File::Temp( UNLINK => 1 ); | |||||
$code = '"addvi.b $w0, $w1, 1"'; | |||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||||
print $tmpf "#include <msa.h>\n\n"; | |||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||||
my @cmd = ("$compiler_name $args"); | |||||
system(@cmd) == 0; | |||||
if ($? != 0) { | |||||
$have_msa = 0; | |||||
} else { | |||||
$have_msa = 1; | |||||
} | |||||
unlink("$tmpf.o"); | |||||
} | } | ||||
unlink("$tmpf.o"); | |||||
} | } | ||||
$architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); | |||||
$no_avx512= 0; | $no_avx512= 0; | ||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) { | if (($architecture eq "x86") || ($architecture eq "x86_64")) { | ||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||||
system(@cmd) == 0; | |||||
if ($? != 0) { | |||||
$no_avx512 = 1; | |||||
} else { | |||||
eval "use File::Temp qw(tempfile)"; | |||||
if ($@){ | |||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||||
$no_avx512 = 0; | $no_avx512 = 0; | ||||
} else { | |||||
# $tmpf = new File::Temp( UNLINK => 1 ); | |||||
($fh,$tmpf) = tempfile( UNLINK => 1 ); | |||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; | |||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||||
system(@cmd) == 0; | |||||
if ($? != 0) { | |||||
$no_avx512 = 1; | |||||
} else { | |||||
$no_avx512 = 0; | |||||
} | |||||
unlink("tmpf.o"); | |||||
} | } | ||||
unlink("tmpf.o"); | |||||
} | } | ||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | ||||
@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS | |||||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | ||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | ||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | ||||
@@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | ||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | ||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | ||||
@@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) | |||||
if (NOT NO_AVX512) | if (NOT NO_AVX512) | ||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | ||||
endif () | endif () | ||||
if (DYNAMIC_LIST) | |||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | |||||
endif () | |||||
endif () | endif () | ||||
if (NOT DYNAMIC_CORE) | if (NOT DYNAMIC_CORE) | ||||
@@ -107,6 +107,12 @@ macro(SetDefaultL1) | |||||
set(DAXPBYKERNEL ../arm/axpby.c) | set(DAXPBYKERNEL ../arm/axpby.c) | ||||
set(CAXPBYKERNEL ../arm/zaxpby.c) | set(CAXPBYKERNEL ../arm/zaxpby.c) | ||||
set(ZAXPBYKERNEL ../arm/zaxpby.c) | set(ZAXPBYKERNEL ../arm/zaxpby.c) | ||||
set(SSUMKERNEL sum.S) | |||||
set(DSUMKERNEL sum.S) | |||||
set(CSUMKERNEL zsum.S) | |||||
set(ZSUMKERNEL zsum.S) | |||||
set(QSUMKERNEL sum.S) | |||||
set(XSUMKERNEL zsum.S) | |||||
endmacro () | endmacro () | ||||
macro(SetDefaultL2) | macro(SetDefaultL2) | ||||
@@ -162,4 +168,4 @@ macro(SetDefaultL3) | |||||
set(DGEADD_KERNEL ../generic/geadd.c) | set(DGEADD_KERNEL ../generic/geadd.c) | ||||
set(CGEADD_KERNEL ../generic/zgeadd.c) | set(CGEADD_KERNEL ../generic/zgeadd.c) | ||||
set(ZGEADD_KERNEL ../generic/zgeadd.c) | set(ZGEADD_KERNEL ../generic/zgeadd.c) | ||||
endmacro () | |||||
endmacro () |
@@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") | |||||
set(NO_EXPRECISION 1) | set(NO_EXPRECISION 1) | ||||
endif () | endif () | ||||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") | |||||
set(EXTRALIB "${EXTRALIB} -lm") | |||||
set(NO_EXPRECISION 1) | |||||
endif () | |||||
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") | if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") | ||||
set(EXTRALIB "${EXTRALIB} -lm") | set(EXTRALIB "${EXTRALIB} -lm") | ||||
endif () | endif () | ||||
@@ -87,13 +87,18 @@ endif () | |||||
# Cannot run getarch on target if we are cross-compiling | # Cannot run getarch on target if we are cross-compiling | ||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | ||||
# Write to config as getarch would | # Write to config as getarch would | ||||
if (DEFINED TARGET_CORE) | |||||
set(TCORE ${TARGET_CORE}) | |||||
else() | |||||
set(TCORE ${CORE}) | |||||
endif() | |||||
# TODO: Set up defines that getarch sets up based on every other target | # TODO: Set up defines that getarch sets up based on every other target | ||||
# Perhaps this should be inside a different file as it grows larger | # Perhaps this should be inside a different file as it grows larger | ||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define ${CORE}\n" | |||||
"#define CHAR_CORENAME \"${CORE}\"\n") | |||||
if ("${CORE}" STREQUAL "ARMV7") | |||||
"#define ${TCORE}\n" | |||||
"#define CHAR_CORENAME \"${TCORE}\"\n") | |||||
if ("${TCORE}" STREQUAL "ARMV7") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_DATA_SIZE\t65536\n" | "#define L1_DATA_SIZE\t65536\n" | ||||
"#define L1_DATA_LINESIZE\t32\n" | "#define L1_DATA_LINESIZE\t32\n" | ||||
@@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(SGEMM_UNROLL_N 4) | set(SGEMM_UNROLL_N 4) | ||||
set(DGEMM_UNROLL_M 4) | set(DGEMM_UNROLL_M 4) | ||||
set(DGEMM_UNROLL_N 4) | set(DGEMM_UNROLL_N 4) | ||||
elseif ("${CORE}" STREQUAL "ARMV8") | |||||
elseif ("${TCORE}" STREQUAL "ARMV8") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_DATA_SIZE\t32768\n" | "#define L1_DATA_SIZE\t32768\n" | ||||
"#define L1_DATA_LINESIZE\t64\n" | "#define L1_DATA_LINESIZE\t64\n" | ||||
@@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
"#define DTB_SIZE\t4096\n" | "#define DTB_SIZE\t4096\n" | ||||
"#define L2_ASSOCIATIVE\t32\n" | "#define L2_ASSOCIATIVE\t32\n" | ||||
"#define ARMV8\n") | "#define ARMV8\n") | ||||
set(SGEMM_UNROLL_M 4) | |||||
set(SGEMM_UNROLL_M 16) | |||||
set(SGEMM_UNROLL_N 4) | set(SGEMM_UNROLL_N 4) | ||||
elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") | |||||
set(DGEMM_UNROLL_M 8) | |||||
set(DGEMM_UNROLL_N 4) | |||||
set(CGEMM_UNROLL_M 8) | |||||
set(CGEMM_UNROLL_N 4) | |||||
set(ZGEMM_UNROLL_M 4) | |||||
set(ZGEMM_UNROLL_N 4) | |||||
set(SYMV_P 16) | |||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_CODE_SIZE\t32768\n" | "#define L1_CODE_SIZE\t32768\n" | ||||
"#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
@@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(DGEMM_UNROLL_N 4) | set(DGEMM_UNROLL_N 4) | ||||
set(CGEMM_UNROLL_M 8) | set(CGEMM_UNROLL_M 8) | ||||
set(CGEMM_UNROLL_N 4) | set(CGEMM_UNROLL_N 4) | ||||
set(ZGEMM_UNROLL_M 8) | |||||
set(ZGEMM_UNROLL_M 4) | |||||
set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") | |||||
set(SYMV_P 16) | |||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_CODE_SIZE\t49152\n" | "#define L1_CODE_SIZE\t49152\n" | ||||
"#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
@@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(DGEMM_UNROLL_N 4) | set(DGEMM_UNROLL_N 4) | ||||
set(CGEMM_UNROLL_M 8) | set(CGEMM_UNROLL_M 8) | ||||
set(CGEMM_UNROLL_N 4) | set(CGEMM_UNROLL_N 4) | ||||
set(ZGEMM_UNROLL_M 8) | |||||
set(ZGEMM_UNROLL_M 4) | |||||
set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
elseif ("${CORE}" STREQUAL "FALKOR") | |||||
set(SYMV_P 16) | |||||
elseif ("${TCORE}" STREQUAL "FALKOR") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_CODE_SIZE\t65536\n" | "#define L1_CODE_SIZE\t65536\n" | ||||
"#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
@@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(DGEMM_UNROLL_N 4) | set(DGEMM_UNROLL_N 4) | ||||
set(CGEMM_UNROLL_M 8) | set(CGEMM_UNROLL_M 8) | ||||
set(CGEMM_UNROLL_N 4) | set(CGEMM_UNROLL_N 4) | ||||
set(ZGEMM_UNROLL_M 8) | |||||
set(ZGEMM_UNROLL_M 4) | |||||
set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
elseif ("${CORE}" STREQUAL "THUNDERX) | |||||
set(SYMV_P 16) | |||||
elseif ("${TCORE}" STREQUAL "THUNDERX") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_CODE_SIZE\t32768\n" | "#define L1_CODE_SIZE\t32768\n" | ||||
"#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
@@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(CGEMM_UNROLL_N 2) | set(CGEMM_UNROLL_N 2) | ||||
set(ZGEMM_UNROLL_M 2) | set(ZGEMM_UNROLL_M 2) | ||||
set(ZGEMM_UNROLL_N 2) | set(ZGEMM_UNROLL_N 2) | ||||
elseif ("${CORE}" STREQUAL "THUNDERX2T99) | |||||
set(SYMV_P 16) | |||||
elseif ("${TCORE}" STREQUAL "THUNDERX2T99") | |||||
file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
"#define L1_CODE_SIZE\t32768\n" | "#define L1_CODE_SIZE\t32768\n" | ||||
"#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
@@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
"#define L3_ASSOCIATIVE\t32\n" | "#define L3_ASSOCIATIVE\t32\n" | ||||
"#define DTB_DEFAULT_ENTRIES\t64\n" | "#define DTB_DEFAULT_ENTRIES\t64\n" | ||||
"#define DTB_SIZE\t4096\n" | "#define DTB_SIZE\t4096\n" | ||||
"#define VULCAN\n") | |||||
"#define ARMV8\n") | |||||
set(SGEMM_UNROLL_M 16) | set(SGEMM_UNROLL_M 16) | ||||
set(SGEMM_UNROLL_N 4) | set(SGEMM_UNROLL_N 4) | ||||
set(DGEMM_UNROLL_M 8) | set(DGEMM_UNROLL_M 8) | ||||
@@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||||
set(CGEMM_UNROLL_N 4) | set(CGEMM_UNROLL_N 4) | ||||
set(ZGEMM_UNROLL_M 4) | set(ZGEMM_UNROLL_M 4) | ||||
set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
set(SYMV_P 16) | |||||
endif() | endif() | ||||
# Or should this actually be NUM_CORES? | # Or should this actually be NUM_CORES? | ||||
@@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | ||||
set(TARGET "BARCELONA") | set(TARGET "BARCELONA") | ||||
endif () | endif () | ||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||||
set(TARGET "ARMV7") | |||||
endif () | |||||
endif () | endif () | ||||
if (DEFINED TARGET) | if (DEFINED TARGET) | ||||
@@ -184,6 +187,13 @@ if (DYNAMIC_ARCH) | |||||
endif () | endif () | ||||
endif () | endif () | ||||
if (DYNAMIC_LIST) | |||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") | |||||
foreach(DCORE ${DYNAMIC_LIST}) | |||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") | |||||
endforeach () | |||||
endif () | |||||
if (NO_LAPACK) | if (NO_LAPACK) | ||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") | set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") | ||||
#Disable LAPACK C interface | #Disable LAPACK C interface | ||||
@@ -39,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | ||||
set(MIPS64 1) | set(MIPS64 1) | ||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | ||||
set(X86_64 1) | |||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||||
set(X86_64 1) | |||||
else() | |||||
set(X86 1) | |||||
endif() | |||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | ||||
set(X86 1) | set(X86 1) | ||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") | ||||
set(ARM 1) | set(ARM 1) | ||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") | ||||
set(ARM64 1) | |||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||||
set(ARM64 1) | |||||
else() | |||||
set(ARM 1) | |||||
endif() | |||||
endif() | endif() | ||||
if (X86_64) | if (X86_64) | ||||
@@ -78,7 +86,7 @@ endif() | |||||
if (X86_64 OR X86) | if (X86_64 OR X86) | ||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | ||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||||
if (NO_AVX512 EQUAL 1) | if (NO_AVX512 EQUAL 1) | ||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | ||||
endif() | endif() | ||||
@@ -85,6 +85,8 @@ extern "C" { | |||||
#if !defined(_MSC_VER) | #if !defined(_MSC_VER) | ||||
#include <unistd.h> | #include <unistd.h> | ||||
#elif _MSC_VER < 1900 | |||||
#define snprintf _snprintf | |||||
#endif | #endif | ||||
#include <time.h> | #include <time.h> | ||||
@@ -348,6 +350,11 @@ typedef int blasint; | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#ifdef POWER9 | |||||
#ifndef YIELDING | |||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||||
#endif | |||||
#endif | |||||
/* | /* | ||||
#ifdef PILEDRIVER | #ifdef PILEDRIVER | ||||
@@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
typedef char env_var_t[MAX_PATH]; | typedef char env_var_t[MAX_PATH]; | ||||
#define readenv(p, n) 0 | #define readenv(p, n) 0 | ||||
#else | #else | ||||
#ifdef OS_WINDOWS | |||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||||
typedef char env_var_t[MAX_PATH]; | typedef char env_var_t[MAX_PATH]; | ||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | ||||
#else | #else | ||||
@@ -19,6 +19,7 @@ | |||||
#define CDOTC_K cdotc_k | #define CDOTC_K cdotc_k | ||||
#define CNRM2_K cnrm2_k | #define CNRM2_K cnrm2_k | ||||
#define CSCAL_K cscal_k | #define CSCAL_K cscal_k | ||||
#define CSUM_K csum_k | |||||
#define CSWAP_K cswap_k | #define CSWAP_K cswap_k | ||||
#define CROT_K csrot_k | #define CROT_K csrot_k | ||||
@@ -249,6 +250,7 @@ | |||||
#define CDOTC_K gotoblas -> cdotc_k | #define CDOTC_K gotoblas -> cdotc_k | ||||
#define CNRM2_K gotoblas -> cnrm2_k | #define CNRM2_K gotoblas -> cnrm2_k | ||||
#define CSCAL_K gotoblas -> cscal_k | #define CSCAL_K gotoblas -> cscal_k | ||||
#define CSUM_K gotoblas -> csum_k | |||||
#define CSWAP_K gotoblas -> cswap_k | #define CSWAP_K gotoblas -> cswap_k | ||||
#define CROT_K gotoblas -> csrot_k | #define CROT_K gotoblas -> csrot_k | ||||
@@ -19,6 +19,7 @@ | |||||
#define DDOTC_K ddot_k | #define DDOTC_K ddot_k | ||||
#define DNRM2_K dnrm2_k | #define DNRM2_K dnrm2_k | ||||
#define DSCAL_K dscal_k | #define DSCAL_K dscal_k | ||||
#define DSUM_K dsum_k | |||||
#define DSWAP_K dswap_k | #define DSWAP_K dswap_k | ||||
#define DROT_K drot_k | #define DROT_K drot_k | ||||
@@ -174,6 +175,7 @@ | |||||
#define DDOTC_K gotoblas -> ddot_k | #define DDOTC_K gotoblas -> ddot_k | ||||
#define DNRM2_K gotoblas -> dnrm2_k | #define DNRM2_K gotoblas -> dnrm2_k | ||||
#define DSCAL_K gotoblas -> dscal_k | #define DSCAL_K gotoblas -> dscal_k | ||||
#define DSUM_K gotoblas -> dsum_k | |||||
#define DSWAP_K gotoblas -> dswap_k | #define DSWAP_K gotoblas -> dswap_k | ||||
#define DROT_K gotoblas -> drot_k | #define DROT_K gotoblas -> drot_k | ||||
@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); | |||||
double BLASFUNC(dzasum)(blasint *, double *, blasint *); | double BLASFUNC(dzasum)(blasint *, double *, blasint *); | ||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | ||||
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); | |||||
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); | |||||
double BLASFUNC(dsum) (blasint *, double *, blasint *); | |||||
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); | |||||
double BLASFUNC(dzsum)(blasint *, double *, blasint *); | |||||
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); | |||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | ||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | ||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | ||||
@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); | |||||
double zasum_k (BLASLONG, double *, BLASLONG); | double zasum_k (BLASLONG, double *, BLASLONG); | ||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | ||||
float ssum_k (BLASLONG, float *, BLASLONG); | |||||
double dsum_k (BLASLONG, double *, BLASLONG); | |||||
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); | |||||
float csum_k (BLASLONG, float *, BLASLONG); | |||||
double zsum_k (BLASLONG, double *, BLASLONG); | |||||
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); | |||||
float samax_k (BLASLONG, float *, BLASLONG); | float samax_k (BLASLONG, float *, BLASLONG); | ||||
double damax_k (BLASLONG, double *, BLASLONG); | double damax_k (BLASLONG, double *, BLASLONG); | ||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | ||||
@@ -66,6 +66,7 @@ | |||||
#define DOTC_K QDOTC_K | #define DOTC_K QDOTC_K | ||||
#define NRM2_K QNRM2_K | #define NRM2_K QNRM2_K | ||||
#define SCAL_K QSCAL_K | #define SCAL_K QSCAL_K | ||||
#define SUM_K QSUM_K | |||||
#define SWAP_K QSWAP_K | #define SWAP_K QSWAP_K | ||||
#define ROT_K QROT_K | #define ROT_K QROT_K | ||||
@@ -356,6 +357,7 @@ | |||||
#define DOTC_K DDOTC_K | #define DOTC_K DDOTC_K | ||||
#define NRM2_K DNRM2_K | #define NRM2_K DNRM2_K | ||||
#define SCAL_K DSCAL_K | #define SCAL_K DSCAL_K | ||||
#define SUM_K DSUM_K | |||||
#define SWAP_K DSWAP_K | #define SWAP_K DSWAP_K | ||||
#define ROT_K DROT_K | #define ROT_K DROT_K | ||||
@@ -658,6 +660,7 @@ | |||||
#define DOTC_K SDOTC_K | #define DOTC_K SDOTC_K | ||||
#define NRM2_K SNRM2_K | #define NRM2_K SNRM2_K | ||||
#define SCAL_K SSCAL_K | #define SCAL_K SSCAL_K | ||||
#define SUM_K SSUM_K | |||||
#define SWAP_K SSWAP_K | #define SWAP_K SSWAP_K | ||||
#define ROT_K SROT_K | #define ROT_K SROT_K | ||||
@@ -962,6 +965,7 @@ | |||||
#define DOTC_K XDOTC_K | #define DOTC_K XDOTC_K | ||||
#define NRM2_K XNRM2_K | #define NRM2_K XNRM2_K | ||||
#define SCAL_K XSCAL_K | #define SCAL_K XSCAL_K | ||||
#define SUM_K XSUM_K | |||||
#define SWAP_K XSWAP_K | #define SWAP_K XSWAP_K | ||||
#define ROT_K XROT_K | #define ROT_K XROT_K | ||||
@@ -1363,6 +1367,7 @@ | |||||
#define DOTC_K ZDOTC_K | #define DOTC_K ZDOTC_K | ||||
#define NRM2_K ZNRM2_K | #define NRM2_K ZNRM2_K | ||||
#define SCAL_K ZSCAL_K | #define SCAL_K ZSCAL_K | ||||
#define SUM_K ZSUM_K | |||||
#define SWAP_K ZSWAP_K | #define SWAP_K ZSWAP_K | ||||
#define ROT_K ZROT_K | #define ROT_K ZROT_K | ||||
@@ -1785,6 +1790,7 @@ | |||||
#define DOTC_K CDOTC_K | #define DOTC_K CDOTC_K | ||||
#define NRM2_K CNRM2_K | #define NRM2_K CNRM2_K | ||||
#define SCAL_K CSCAL_K | #define SCAL_K CSCAL_K | ||||
#define SUM_K CSUM_K | |||||
#define SWAP_K CSWAP_K | #define SWAP_K CSWAP_K | ||||
#define ROT_K CROT_K | #define ROT_K CROT_K | ||||
@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG); | float (*snrm2_k) (BLASLONG, float *, BLASLONG); | ||||
float (*sasum_k) (BLASLONG, float *, BLASLONG); | float (*sasum_k) (BLASLONG, float *, BLASLONG); | ||||
float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | ||||
double (*dasum_k) (BLASLONG, double *, BLASLONG); | double (*dasum_k) (BLASLONG, double *, BLASLONG); | ||||
double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | ||||
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | ||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | ||||
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); | |||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | ||||
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | ||||
float (*casum_k) (BLASLONG, float *, BLASLONG); | float (*casum_k) (BLASLONG, float *, BLASLONG); | ||||
float (*csum_k) (BLASLONG, float *, BLASLONG); | |||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG); | double (*znrm2_k) (BLASLONG, double *, BLASLONG); | ||||
double (*zasum_k) (BLASLONG, double *, BLASLONG); | double (*zasum_k) (BLASLONG, double *, BLASLONG); | ||||
double (*zsum_k) (BLASLONG, double *, BLASLONG); | |||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | ||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | ||||
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); | |||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
@@ -39,7 +39,7 @@ | |||||
#ifndef COMMON_POWER | #ifndef COMMON_POWER | ||||
#define COMMON_POWER | #define COMMON_POWER | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#define MB __asm__ __volatile__ ("eieio":::"memory") | #define MB __asm__ __volatile__ ("eieio":::"memory") | ||||
#define WMB __asm__ __volatile__ ("eieio":::"memory") | #define WMB __asm__ __volatile__ ("eieio":::"memory") | ||||
#else | #else | ||||
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
#define HAVE_PREFETCH | #define HAVE_PREFETCH | ||||
#endif | #endif | ||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||||
#define DCBT_ARG 0 | #define DCBT_ARG 0 | ||||
#else | #else | ||||
#define DCBT_ARG 8 | #define DCBT_ARG 8 | ||||
@@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
#define L1_PREFETCH dcbtst | #define L1_PREFETCH dcbtst | ||||
#endif | #endif | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#define L1_DUALFETCH | #define L1_DUALFETCH | ||||
#define L1_PREFETCHSIZE (16 + 128 * 100) | #define L1_PREFETCHSIZE (16 + 128 * 100) | ||||
#define L1_PREFETCH dcbtst | #define L1_PREFETCH dcbtst | ||||
@@ -598,9 +598,14 @@ REALNAME:;\ | |||||
#ifndef __64BIT__ | #ifndef __64BIT__ | ||||
#define PROLOGUE \ | #define PROLOGUE \ | ||||
.machine "any";\ | .machine "any";\ | ||||
.toc;\ | |||||
.globl .REALNAME;\ | .globl .REALNAME;\ | ||||
.globl REALNAME;\ | |||||
.csect REALNAME[DS],3;\ | |||||
REALNAME:;\ | |||||
.long .REALNAME, TOC[tc0], 0;\ | |||||
.csect .text[PR],5;\ | .csect .text[PR],5;\ | ||||
.REALNAME:; | |||||
.REALNAME: | |||||
#define EPILOGUE \ | #define EPILOGUE \ | ||||
_section_.text:;\ | _section_.text:;\ | ||||
@@ -611,9 +616,14 @@ _section_.text:;\ | |||||
#define PROLOGUE \ | #define PROLOGUE \ | ||||
.machine "any";\ | .machine "any";\ | ||||
.toc;\ | |||||
.globl .REALNAME;\ | .globl .REALNAME;\ | ||||
.globl REALNAME;\ | |||||
.csect REALNAME[DS],3;\ | |||||
REALNAME:;\ | |||||
.llong .REALNAME, TOC[tc0], 0;\ | |||||
.csect .text[PR], 5;\ | .csect .text[PR], 5;\ | ||||
.REALNAME:; | |||||
.REALNAME: | |||||
#define EPILOGUE \ | #define EPILOGUE \ | ||||
_section_.text:;\ | _section_.text:;\ | ||||
@@ -802,7 +812,7 @@ Lmcount$lazy_ptr: | |||||
#define BUFFER_SIZE ( 2 << 20) | #define BUFFER_SIZE ( 2 << 20) | ||||
#elif defined(PPC440FP2) | #elif defined(PPC440FP2) | ||||
#define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
#elif defined(POWER8) | |||||
#elif defined(POWER8) || defined(POWER9) | |||||
#define BUFFER_SIZE ( 64 << 20) | #define BUFFER_SIZE ( 64 << 20) | ||||
#else | #else | ||||
#define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
@@ -19,6 +19,7 @@ | |||||
#define QDOTC_K qdot_k | #define QDOTC_K qdot_k | ||||
#define QNRM2_K qnrm2_k | #define QNRM2_K qnrm2_k | ||||
#define QSCAL_K qscal_k | #define QSCAL_K qscal_k | ||||
#define QSUM_K qsum_k | |||||
#define QSWAP_K qswap_k | #define QSWAP_K qswap_k | ||||
#define QROT_K qrot_k | #define QROT_K qrot_k | ||||
@@ -161,6 +162,7 @@ | |||||
#define QDOTC_K gotoblas -> qdot_k | #define QDOTC_K gotoblas -> qdot_k | ||||
#define QNRM2_K gotoblas -> qnrm2_k | #define QNRM2_K gotoblas -> qnrm2_k | ||||
#define QSCAL_K gotoblas -> qscal_k | #define QSCAL_K gotoblas -> qscal_k | ||||
#define QSUM_K gotoblas -> qsum_k | |||||
#define QSWAP_K gotoblas -> qswap_k | #define QSWAP_K gotoblas -> qswap_k | ||||
#define QROT_K gotoblas -> qrot_k | #define QROT_K gotoblas -> qrot_k | ||||
@@ -12,6 +12,7 @@ | |||||
#define ISMAX_K ismax_k | #define ISMAX_K ismax_k | ||||
#define ISMIN_K ismin_k | #define ISMIN_K ismin_k | ||||
#define SASUM_K sasum_k | #define SASUM_K sasum_k | ||||
#define SSUM_K ssum_k | |||||
#define SAXPYU_K saxpy_k | #define SAXPYU_K saxpy_k | ||||
#define SAXPYC_K saxpy_k | #define SAXPYC_K saxpy_k | ||||
#define SCOPY_K scopy_k | #define SCOPY_K scopy_k | ||||
@@ -170,6 +171,7 @@ | |||||
#define ISMAX_K gotoblas -> ismax_k | #define ISMAX_K gotoblas -> ismax_k | ||||
#define ISMIN_K gotoblas -> ismin_k | #define ISMIN_K gotoblas -> ismin_k | ||||
#define SASUM_K gotoblas -> sasum_k | #define SASUM_K gotoblas -> sasum_k | ||||
#define SSUM_K gotoblas -> ssum_k | |||||
#define SAXPYU_K gotoblas -> saxpy_k | #define SAXPYU_K gotoblas -> saxpy_k | ||||
#define SAXPYC_K gotoblas -> saxpy_k | #define SAXPYC_K gotoblas -> saxpy_k | ||||
#define SCOPY_K gotoblas -> scopy_k | #define SCOPY_K gotoblas -> scopy_k | ||||
@@ -19,6 +19,7 @@ | |||||
#define XDOTC_K xdotc_k | #define XDOTC_K xdotc_k | ||||
#define XNRM2_K xnrm2_k | #define XNRM2_K xnrm2_k | ||||
#define XSCAL_K xscal_k | #define XSCAL_K xscal_k | ||||
#define XSUM_K xsum_k | |||||
#define XSWAP_K xswap_k | #define XSWAP_K xswap_k | ||||
#define XROT_K xqrot_k | #define XROT_K xqrot_k | ||||
@@ -227,6 +228,7 @@ | |||||
#define XDOTC_K gotoblas -> xdotc_k | #define XDOTC_K gotoblas -> xdotc_k | ||||
#define XNRM2_K gotoblas -> xnrm2_k | #define XNRM2_K gotoblas -> xnrm2_k | ||||
#define XSCAL_K gotoblas -> xscal_k | #define XSCAL_K gotoblas -> xscal_k | ||||
#define XSUM_K gotoblas -> xsum_k | |||||
#define XSWAP_K gotoblas -> xswap_k | #define XSWAP_K gotoblas -> xswap_k | ||||
#define XROT_K gotoblas -> xqrot_k | #define XROT_K gotoblas -> xqrot_k | ||||
@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
y = blas_quick_divide_table[y]; | y = blas_quick_divide_table[y]; | ||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); | |||||
return result; | return result; | ||||
#endif | #endif | ||||
@@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||||
"=b" (*ebx), | "=b" (*ebx), | ||||
"=c" (*ecx), | "=c" (*ecx), | ||||
"=d" (*edx) | "=d" (*edx) | ||||
: "0" (op)); | |||||
: "0" (op), "c"(0)); | |||||
#endif | #endif | ||||
} | } | ||||
@@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
y = blas_quick_divide_table[y]; | y = blas_quick_divide_table[y]; | ||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||||
return result; | return result; | ||||
} | } | ||||
@@ -19,6 +19,7 @@ | |||||
#define ZDOTC_K zdotc_k | #define ZDOTC_K zdotc_k | ||||
#define ZNRM2_K znrm2_k | #define ZNRM2_K znrm2_k | ||||
#define ZSCAL_K zscal_k | #define ZSCAL_K zscal_k | ||||
#define ZSUM_K zsum_k | |||||
#define ZSWAP_K zswap_k | #define ZSWAP_K zswap_k | ||||
#define ZROT_K zdrot_k | #define ZROT_K zdrot_k | ||||
@@ -249,6 +250,7 @@ | |||||
#define ZDOTC_K gotoblas -> zdotc_k | #define ZDOTC_K gotoblas -> zdotc_k | ||||
#define ZNRM2_K gotoblas -> znrm2_k | #define ZNRM2_K gotoblas -> znrm2_k | ||||
#define ZSCAL_K gotoblas -> zscal_k | #define ZSCAL_K gotoblas -> zscal_k | ||||
#define ZSUM_K gotoblas -> zsum_k | |||||
#define ZSWAP_K gotoblas -> zswap_k | #define ZSWAP_K gotoblas -> zswap_k | ||||
#define ZROT_K gotoblas -> zdrot_k | #define ZROT_K gotoblas -> zdrot_k | ||||
@@ -53,6 +53,7 @@ | |||||
#define VENDOR_SIS 8 | #define VENDOR_SIS 8 | ||||
#define VENDOR_TRANSMETA 9 | #define VENDOR_TRANSMETA 9 | ||||
#define VENDOR_NSC 10 | #define VENDOR_NSC 10 | ||||
#define VENDOR_HYGON 11 | |||||
#define VENDOR_UNKNOWN 99 | #define VENDOR_UNKNOWN 99 | ||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | ||||
@@ -116,6 +117,7 @@ | |||||
#define CORE_EXCAVATOR 26 | #define CORE_EXCAVATOR 26 | ||||
#define CORE_ZEN 27 | #define CORE_ZEN 27 | ||||
#define CORE_SKYLAKEX 28 | #define CORE_SKYLAKEX 28 | ||||
#define CORE_DHYANA 29 | |||||
#define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
#define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
@@ -139,6 +141,7 @@ | |||||
#define HAVE_FMA4 (1 << 19) | #define HAVE_FMA4 (1 << 19) | ||||
#define HAVE_FMA3 (1 << 20) | #define HAVE_FMA3 (1 << 20) | ||||
#define HAVE_AVX512VL (1 << 21) | #define HAVE_AVX512VL (1 << 21) | ||||
#define HAVE_AVX2 (1 << 22) | |||||
#define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
#define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
@@ -214,5 +217,8 @@ typedef struct { | |||||
#define CPUTYPE_EXCAVATOR 50 | #define CPUTYPE_EXCAVATOR 50 | ||||
#define CPUTYPE_ZEN 51 | #define CPUTYPE_ZEN 51 | ||||
#define CPUTYPE_SKYLAKEX 52 | #define CPUTYPE_SKYLAKEX 52 | ||||
#define CPUTYPE_DHYANA 53 | |||||
#define CPUTYPE_HYGON_UNKNOWN 54 | |||||
#endif | #endif |
@@ -39,6 +39,8 @@ | |||||
// Cavium | // Cavium | ||||
#define CPU_THUNDERX 7 | #define CPU_THUNDERX 7 | ||||
#define CPU_THUNDERX2T99 8 | #define CPU_THUNDERX2T99 8 | ||||
//Hisilicon | |||||
#define CPU_TSV110 9 | |||||
static char *cpuname[] = { | static char *cpuname[] = { | ||||
"UNKNOWN", | "UNKNOWN", | ||||
@@ -49,7 +51,8 @@ static char *cpuname[] = { | |||||
"CORTEXA73", | "CORTEXA73", | ||||
"FALKOR", | "FALKOR", | ||||
"THUNDERX", | "THUNDERX", | ||||
"THUNDERX2T99" | |||||
"THUNDERX2T99", | |||||
"TSV110" | |||||
}; | }; | ||||
static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
@@ -61,7 +64,8 @@ static char *cpuname_lower[] = { | |||||
"cortexa73", | "cortexa73", | ||||
"falkor", | "falkor", | ||||
"thunderx", | "thunderx", | ||||
"thunderx2t99" | |||||
"thunderx2t99", | |||||
"tsv110" | |||||
}; | }; | ||||
int get_feature(char *search) | int get_feature(char *search) | ||||
@@ -145,6 +149,9 @@ int detect(void) | |||||
return CPU_THUNDERX; | return CPU_THUNDERX; | ||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | ||||
return CPU_THUNDERX2T99; | return CPU_THUNDERX2T99; | ||||
// HiSilicon | |||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||||
return CPU_TSV110; | |||||
} | } | ||||
p = (char *) NULL ; | p = (char *) NULL ; | ||||
@@ -286,6 +293,21 @@ void get_cpuconfig(void) | |||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | ||||
printf("#define DTB_SIZE 4096 \n"); | printf("#define DTB_SIZE 4096 \n"); | ||||
break; | break; | ||||
case CPU_TSV110: | |||||
printf("#define TSV110 \n"); | |||||
printf("#define L1_CODE_SIZE 65536 \n"); | |||||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||||
printf("#define L1_DATA_SIZE 65536 \n"); | |||||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||||
printf("#define L2_SIZE 524228 \n"); | |||||
printf("#define L2_LINESIZE 64 \n"); | |||||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
printf("#define DTB_SIZE 4096 \n"); | |||||
break; | |||||
} | } | ||||
} | } | ||||
@@ -94,7 +94,7 @@ char *corename[] = { | |||||
"CELL", | "CELL", | ||||
"PPCG4", | "PPCG4", | ||||
"POWER8", | "POWER8", | ||||
"POWER8" | |||||
"POWER9" | |||||
}; | }; | ||||
int detect(void){ | int detect(void){ | ||||
@@ -124,7 +124,7 @@ int detect(void){ | |||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | ||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | ||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
@@ -156,7 +156,7 @@ int detect(void){ | |||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | ||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | ||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
return CPUTYPE_POWER5; | return CPUTYPE_POWER5; | ||||
@@ -180,7 +180,7 @@ int id; | |||||
__asm __volatile("mfpvr %0" : "=r"(id)); | __asm __volatile("mfpvr %0" : "=r"(id)); | ||||
switch ( id >> 16 ) { | switch ( id >> 16 ) { | ||||
case 0x4e: // POWER9 | case 0x4e: // POWER9 | ||||
return CPUTYPE_POWER8; | |||||
return CPUTYPE_POWER9; | |||||
break; | break; | ||||
case 0x4d: | case 0x4d: | ||||
case 0x4b: // POWER8/8E | case 0x4b: // POWER8/8E | ||||
@@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||||
("mov %%ebx, %%edi;" | ("mov %%ebx, %%edi;" | ||||
"cpuid;" | "cpuid;" | ||||
"xchgl %%ebx, %%edi;" | "xchgl %%ebx, %%edi;" | ||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); | |||||
#else | #else | ||||
__asm__ __volatile__ | __asm__ __volatile__ | ||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); | |||||
#endif | #endif | ||||
} | } | ||||
@@ -211,6 +211,44 @@ int support_avx(){ | |||||
#endif | #endif | ||||
} | } | ||||
/*
 * Report whether AVX2 may be used.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 reports the AVX2
 * feature flag; returns 0 otherwise. Always returns 0 when the build
 * defines NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /*
   * AVX2 is EBX bit 5 of CPUID leaf 7 (the same bit support_avx512()
   * tests as "ebx & 32"). Bit 7, which was tested here before, is SMEP
   * and says nothing about AVX2.
   */
  return (ebx & (1 << 5)) != 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (specifically the AVX512VL subset) may be used.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 EBX bit 5 (AVX2) is set,
 *   - CPUID leaf 7 EBX bit 31 (AVX512VL) is set,
 *   - bits 5..7 (mask 0xe0) of the value returned by xgetbv(0, ...) are
 *     all set (presumably XCR0: opmask / ZMM_Hi256 / Hi16_ZMM state
 *     enabled by the OS -- confirm against the xgetbv helper).
 * Returns 0 otherwise, and always 0 when NO_AVX or NO_AVX512 is defined.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* Previously this branch only re-assigned ret=0 and fell through, so a
     missing AVX2 flag never actually stopped the AVX512 probe. */
  if ((ebx & (1u << 5)) == 0)
    return 0;   /* not even AVX2 */
  /* Unsigned constant: 1<<31 overflows a signed int. */
  if ((ebx & (1u << 31)) == 0)
    return 0;   /* CPU lacks AVX512VL */
  xgetbv(0, &eax, &edx);
  return (eax & 0xe0) == 0xe0;
#else
  return 0;
#endif
}
int get_vendor(void){ | int get_vendor(void){ | ||||
int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
@@ -233,6 +271,7 @@ int get_vendor(void){ | |||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | ||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | ||||
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; | if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; | ||||
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; | |||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | ||||
@@ -294,6 +333,8 @@ int get_cputype(int gettype){ | |||||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | ||||
#ifndef NO_AVX | #ifndef NO_AVX | ||||
if (support_avx()) feature |= HAVE_AVX; | if (support_avx()) feature |= HAVE_AVX; | ||||
if (support_avx2()) feature |= HAVE_AVX2; | |||||
if (support_avx512()) feature |= HAVE_AVX512VL; | |||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | ||||
#endif | #endif | ||||
@@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||||
} | } | ||||
} | } | ||||
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { | |||||
if ((get_vendor() == VENDOR_AMD) || | |||||
(get_vendor() == VENDOR_HYGON) || | |||||
(get_vendor() == VENDOR_CENTAUR)) { | |||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | ||||
LDTB.size = 4096; | LDTB.size = 4096; | ||||
@@ -1228,22 +1271,18 @@ int get_cpuname(void){ | |||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 12: | case 12: | ||||
case 15: | case 15: | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 13: | case 13: | ||||
//Broadwell | //Broadwell | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
} | } | ||||
@@ -1252,33 +1291,27 @@ int get_cpuname(void){ | |||||
switch (model) { | switch (model) { | ||||
case 5: | case 5: | ||||
case 6: | case 6: | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 7: | case 7: | ||||
case 15: | case 15: | ||||
//Broadwell | //Broadwell | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 14: | case 14: | ||||
//Skylake | //Skylake | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 12: | case 12: | ||||
@@ -1292,80 +1325,66 @@ int get_cpuname(void){ | |||||
switch (model) { | switch (model) { | ||||
case 6: | case 6: | ||||
//Broadwell | //Broadwell | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 5: | case 5: | ||||
// Skylake X | // Skylake X | ||||
#ifndef NO_AVX512 | |||||
return CPUTYPE_SKYLAKEX; | |||||
#else | |||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
return CPUTYPE_HASWELL; | |||||
#else | |||||
return CPUTYPE_SANDYBRIDGE; | |||||
#endif | |||||
if(support_avx512()) | |||||
return CPUTYPE_SKYLAKEX; | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
#endif | |||||
case 14: | case 14: | ||||
// Skylake | // Skylake | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 7: | case 7: | ||||
// Xeon Phi Knights Landing | // Xeon Phi Knights Landing | ||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
case 12: | case 12: | ||||
// Apollo Lake | // Apollo Lake | ||||
case 15: | |||||
// Denverton | |||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
} | } | ||||
break; | break; | ||||
case 6: | case 6: | ||||
switch (model) { | switch (model) { | ||||
case 6: // Cannon Lake | case 6: // Cannon Lake | ||||
#ifndef NO_AVX512 | |||||
return CPUTYPE_SKYLAKEX; | |||||
#else | |||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
return CPUTYPE_HASWELL; | |||||
#else | |||||
return CPUTYPE_SANDYBRIDGE; | |||||
#endif | |||||
if(support_avx512()) | |||||
return CPUTYPE_SKYLAKEX; | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
#endif | |||||
} | } | ||||
break; | break; | ||||
case 9: | case 9: | ||||
case 8: | |||||
case 8: | |||||
switch (model) { | switch (model) { | ||||
case 14: // Kaby Lake | |||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
case 14: // Kaby Lake and refreshes | |||||
if(support_avx2()) | |||||
return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
#else | |||||
if(support_avx()) | |||||
return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
#endif | |||||
else | else | ||||
return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
} | } | ||||
@@ -1469,6 +1488,26 @@ int get_cpuname(void){ | |||||
return CPUTYPE_AMD_UNKNOWN; | return CPUTYPE_AMD_UNKNOWN; | ||||
} | } | ||||
/* Hygon: only family 0xF with extended family 9 (Dhyana) is recognised;
   every other Hygon part is reported as CPUTYPE_HYGON_UNKNOWN. */
if (vendor == VENDOR_HYGON){
  if (family == 0xf && exfamily == 9) {
    /* Hygon Dhyana */
    if (!support_avx())
      return CPUTYPE_BARCELONA;
#ifndef NO_AVX2
    return CPUTYPE_ZEN;
#else
    return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
  }
  return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){ | if (vendor == VENDOR_CYRIX){ | ||||
switch (family) { | switch (family) { | ||||
case 0x4: | case 0x4: | ||||
@@ -1590,7 +1629,8 @@ static char *cpuname[] = { | |||||
"STEAMROLLER", | "STEAMROLLER", | ||||
"EXCAVATOR", | "EXCAVATOR", | ||||
"ZEN", | "ZEN", | ||||
"SKYLAKEX" | |||||
"SKYLAKEX", | |||||
"DHYANA" | |||||
}; | }; | ||||
static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
@@ -1645,7 +1685,8 @@ static char *lowercpuname[] = { | |||||
"steamroller", | "steamroller", | ||||
"excavator", | "excavator", | ||||
"zen", | "zen", | ||||
"skylakex" | |||||
"skylakex", | |||||
"dhyana" | |||||
}; | }; | ||||
static char *corename[] = { | static char *corename[] = { | ||||
@@ -1677,7 +1718,8 @@ static char *corename[] = { | |||||
"STEAMROLLER", | "STEAMROLLER", | ||||
"EXCAVATOR", | "EXCAVATOR", | ||||
"ZEN", | "ZEN", | ||||
"SKYLAKEX" | |||||
"SKYLAKEX", | |||||
"DHYANA" | |||||
}; | }; | ||||
static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
@@ -1709,7 +1751,8 @@ static char *corename_lower[] = { | |||||
"steamroller", | "steamroller", | ||||
"excavator", | "excavator", | ||||
"zen", | "zen", | ||||
"skylakex" | |||||
"skylakex", | |||||
"dhyana" | |||||
}; | }; | ||||
@@ -2026,6 +2069,23 @@ int get_coretype(void){ | |||||
} | } | ||||
} | } | ||||
/* Hygon, family 0xF: extended family 9 (Dhyana) with AVX maps to the Zen
   core (or Sandy Bridge when AVX2 is compiled out); everything else in
   this family maps to Barcelona. Other families fall through. */
if (vendor == VENDOR_HYGON && family == 0xf){
  if (exfamily != 9 || !support_avx())
    return CORE_BARCELONA;
#ifndef NO_AVX2
  return CORE_ZEN;
#else
  return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
}
if (vendor == VENDOR_CENTAUR) { | if (vendor == VENDOR_CENTAUR) { | ||||
switch (family) { | switch (family) { | ||||
case 0x6: | case 0x6: | ||||
@@ -2112,6 +2172,8 @@ void get_cpuconfig(void){ | |||||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | ||||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | ||||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | ||||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | ||||
@@ -2180,6 +2242,8 @@ void get_sse(void){ | |||||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | ||||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | ||||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | ||||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | ||||
@@ -27,9 +27,9 @@ | |||||
#include <string.h> | #include <string.h> | ||||
#define CPU_GENERIC 0 | |||||
#define CPU_Z13 1 | |||||
#define CPU_Z14 2 | |||||
#define CPU_GENERIC 0 | |||||
#define CPU_Z13 1 | |||||
#define CPU_Z14 2 | |||||
static char *cpuname[] = { | static char *cpuname[] = { | ||||
"ZARCH_GENERIC", | "ZARCH_GENERIC", | ||||
@@ -64,10 +64,8 @@ int detect(void) | |||||
if (strstr(p, "2964")) return CPU_Z13; | if (strstr(p, "2964")) return CPU_Z13; | ||||
if (strstr(p, "2965")) return CPU_Z13; | if (strstr(p, "2965")) return CPU_Z13; | ||||
/* detect z14, but fall back to z13 */ | |||||
if (strstr(p, "3906")) return CPU_Z13; | |||||
if (strstr(p, "3907")) return CPU_Z13; | |||||
if (strstr(p, "3906")) return CPU_Z14; | |||||
if (strstr(p, "3907")) return CPU_Z14; | |||||
return CPU_GENERIC; | return CPU_GENERIC; | ||||
} | } | ||||
@@ -116,7 +114,14 @@ void get_cpuconfig(void) | |||||
break; | break; | ||||
case CPU_Z14: | case CPU_Z14: | ||||
printf("#define Z14\n"); | printf("#define Z14\n"); | ||||
printf("#define L1_DATA_SIZE 131072\n"); | |||||
printf("#define L1_DATA_LINESIZE 256\n"); | |||||
printf("#define L1_DATA_ASSOCIATIVE 8\n"); | |||||
printf("#define L2_SIZE 4194304\n"); | |||||
printf("#define L2_LINESIZE 256\n"); | |||||
printf("#define L2_ASSOCIATIVE 8\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
printf("#define DTB_SIZE 4096\n"); | |||||
break; | break; | ||||
} | } | ||||
} | } |
@@ -113,7 +113,7 @@ ARCH_X86 | |||||
ARCH_X86_64 | ARCH_X86_64 | ||||
#endif | #endif | ||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) | |||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) | |||||
ARCH_POWER | ARCH_POWER | ||||
#endif | #endif | ||||
@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | ||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||||
queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | range_m[num_cpu + 1] = range_m[num_cpu] + width; | ||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | ||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||||
queue[num_cpu].mode = mode; | queue[num_cpu].mode = mode; | ||||
queue[num_cpu].routine = trmv_kernel; | queue[num_cpu].routine = trmv_kernel; | ||||
@@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||||
ifeq ($(ARCH),arm64) | ifeq ($(ARCH),arm64) | ||||
COMMONOBJS += dynamic_arm64.$(SUFFIX) | COMMONOBJS += dynamic_arm64.$(SUFFIX) | ||||
else | else | ||||
ifeq ($(ARCH),power) | |||||
COMMONOBJS += dynamic_power.$(SUFFIX) | |||||
else | |||||
COMMONOBJS += dynamic.$(SUFFIX) | COMMONOBJS += dynamic.$(SUFFIX) | ||||
endif | endif | ||||
endif | |||||
else | else | ||||
COMMONOBJS += parameter.$(SUFFIX) | COMMONOBJS += parameter.$(SUFFIX) | ||||
endif | endif | ||||
@@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||||
ifeq ($(ARCH),arm64) | ifeq ($(ARCH),arm64) | ||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) | ||||
else | else | ||||
ifeq ($(ARCH),power) | |||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) | |||||
else | |||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | ||||
endif | endif | ||||
endif | |||||
else | else | ||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | ||||
endif | endif | ||||
@@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||||
SetEvent(pool.killed); | SetEvent(pool.killed); | ||||
for(i = 0; i < blas_num_threads - 1; i++){ | for(i = 0; i < blas_num_threads - 1; i++){ | ||||
// Could also just use WaitForMultipleObjects | |||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE); | WaitForSingleObject(blas_threads[i], 5); //INFINITE); | ||||
#ifndef OS_WINDOWSSTORE | #ifndef OS_WINDOWSSTORE | ||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | ||||
TerminateThread(blas_threads[i],0); | TerminateThread(blas_threads[i],0); | ||||
#endif | #endif | ||||
CloseHandle(blas_threads[i]); | |||||
} | } | ||||
CloseHandle(pool.filled); | |||||
CloseHandle(pool.killed); | |||||
blas_server_avail = 0; | blas_server_avail = 0; | ||||
} | } | ||||
@@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; | |||||
#define VENDOR_INTEL 1 | #define VENDOR_INTEL 1 | ||||
#define VENDOR_AMD 2 | #define VENDOR_AMD 2 | ||||
#define VENDOR_CENTAUR 3 | #define VENDOR_CENTAUR 3 | ||||
#define VENDOR_HYGON 4 | |||||
#define VENDOR_UNKNOWN 99 | #define VENDOR_UNKNOWN 99 | ||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | ||||
@@ -304,9 +305,49 @@ int support_avx(){ | |||||
#endif | #endif | ||||
} | } | ||||
/*
 * Report whether AVX2 kernels may be selected at run time.
 *
 * Returns 1 only when support_avx() reports true and CPUID leaf 7
 * advertises AVX2; returns 0 otherwise. Always returns 0 when the
 * library is built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  /* ecx=0 requests sub-leaf 0 of leaf 7 -- assumes cpuid() forwards the
     incoming *ecx value to the instruction; TODO confirm against its definition */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 feature flag.
     Bit 7 (tested previously) is SMEP, which says nothing about SIMD support. */
  if((ebx & (1<<5)) != 0)
    return 1;  //CPU reports AVX2 and support_avx() already passed
  return 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (SkylakeX) kernels may be selected at run time.
 *
 * Returns 1 only when support_avx() reports true, CPUID leaf 7 advertises
 * both AVX2 and AVX512VL, and XCR0 has the three AVX-512 state bits set.
 * Returns 0 otherwise, and always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  /* ecx=0 requests sub-leaf 0 of leaf 7, matching support_avx2() --
     assumes cpuid() forwards the incoming *ecx value; TODO confirm */
  int eax, ebx, ecx=0, edx;
  int ret=0;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* EBX bit 5 = AVX2. The earlier test "(ebx & (1<<7)) != 1" could never be
     false (the masked value is 0 or 128) and only assigned ret=0, so the
     AVX2 precondition was silently skipped. */
  if((ebx & (1<<5)) == 0)
    return 0;  //CPU does not even report AVX2
  /* EBX bit 31 = AVX512VL; unsigned constant avoids shifting into the sign bit */
  if((ebx & (1u<<31)) != 0){
    /* assumes xgetbv() stores the low 32 bits of XCR0 in eax -- TODO confirm */
    xgetbv(0, &eax, &edx);
    /* 0xe0 = XCR0 bits 5..7 (opmask, ZMM_Hi256, Hi16_ZMM state) */
    if((eax & 0xe0) == 0xe0)
      ret=1;  //OS supports AVX512VL
  }
  return ret;
#else
  return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
#define FALLBACK_VERBOSE 1 | #define FALLBACK_VERBOSE 1 | ||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | ||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" | |||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" | |||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" | #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" | ||||
static int get_vendor(void){ | static int get_vendor(void){ | ||||
@@ -329,6 +370,7 @@ static int get_vendor(void){ | |||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | ||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | ||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | ||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | ||||
@@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){ | |||||
} | } | ||||
//Intel Haswell | //Intel Haswell | ||||
if (model == 12 || model == 15) { | if (model == 12 || model == 15) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
//Intel Broadwell | //Intel Broadwell | ||||
if (model == 13) { | if (model == 13) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
@@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){ | |||||
case 4: | case 4: | ||||
//Intel Haswell | //Intel Haswell | ||||
if (model == 5 || model == 6) { | if (model == 5 || model == 6) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
//Intel Broadwell | //Intel Broadwell | ||||
if (model == 7 || model == 15) { | if (model == 7 || model == 15) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
//Intel Skylake | //Intel Skylake | ||||
if (model == 14) { | if (model == 14) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
@@ -457,72 +514,86 @@ static gotoblas_t *get_coretype(void){ | |||||
case 5: | case 5: | ||||
//Intel Broadwell | //Intel Broadwell | ||||
if (model == 6) { | if (model == 6) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
if (model == 5) { | if (model == 5) { | ||||
// Intel Skylake X | // Intel Skylake X | ||||
#ifndef NO_AVX512 | |||||
return &gotoblas_SKYLAKEX; | |||||
#else | |||||
if(support_avx()) | |||||
if (support_avx512()) | |||||
return &gotoblas_SKYLAKEX; | |||||
if(support_avx2()){ | |||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
return &gotoblas_NEHALEM; | |||||
} | |||||
#endif | |||||
} | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
return &gotoblas_NEHALEM; | |||||
} | |||||
} | } | ||||
//Intel Skylake | //Intel Skylake | ||||
if (model == 14) { | if (model == 14) { | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
//Intel Phi Knights Landing | //Intel Phi Knights Landing | ||||
if (model == 7) { | if (model == 7) { | ||||
if(support_avx()) | |||||
if(support_avx2()){ | |||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
} | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
//Apollo Lake | |||||
if (model == 12) { | |||||
//Apollo Lake or Denverton | |||||
if (model == 12 || model == 15) { | |||||
return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
} | } | ||||
return NULL; | return NULL; | ||||
case 6: | case 6: | ||||
if (model == 6) { | if (model == 6) { | ||||
// Cannon Lake | // Cannon Lake | ||||
#ifndef NO_AVX512 | |||||
return &gotoblas_SKYLAKEX; | |||||
#else | |||||
if(support_avx()) | |||||
#ifndef NO_AVX2 | |||||
return &gotoblas_HASWELL; | |||||
#else | |||||
return &gotoblas_SANDYBRIDGE; | |||||
#endif | |||||
else | |||||
return &gotoblas_NEHALEM; | |||||
#endif | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
return &gotoblas_NEHALEM; | |||||
} | |||||
} | } | ||||
return NULL; | return NULL; | ||||
case 9: | case 9: | ||||
case 8: | case 8: | ||||
if (model == 14 ) { // Kaby Lake | if (model == 14 ) { // Kaby Lake | ||||
if(support_avx()) | |||||
if(support_avx2()) | |||||
return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
else{ | |||||
if(support_avx()) { | |||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
return &gotoblas_SANDYBRIDGE; | |||||
} else { | |||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
@@ -535,7 +606,7 @@ static gotoblas_t *get_coretype(void){ | |||||
} | } | ||||
} | } | ||||
if (vendor == VENDOR_AMD){ | |||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ | |||||
if (family <= 0xe) { | if (family <= 0xe) { | ||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon | // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon | ||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | ||||
@@ -615,6 +686,13 @@ static gotoblas_t *get_coretype(void){ | |||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
} | } | ||||
} | } | ||||
} else if (exfamily == 9) { | |||||
if(support_avx()) | |||||
return &gotoblas_ZEN; | |||||
else{ | |||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
} | |||||
}else { | }else { | ||||
return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
} | } | ||||
@@ -0,0 +1,102 @@ | |||||
#include "common.h" | |||||
extern gotoblas_t gotoblas_POWER6; | |||||
extern gotoblas_t gotoblas_POWER8; | |||||
extern gotoblas_t gotoblas_POWER9; | |||||
extern void openblas_warning(int verbose, const char *msg); | |||||
static char *corename[] = { | |||||
"unknown", | |||||
"POWER6", | |||||
"POWER8", | |||||
"POWER9" | |||||
}; | |||||
#define NUM_CORETYPES 4 | |||||
char *gotoblas_corename(void) { | |||||
if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||||
if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||||
if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||||
return corename[0]; | |||||
} | |||||
static gotoblas_t *get_coretype(void) { | |||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||||
return &gotoblas_POWER6; | |||||
if (__builtin_cpu_is("power8")) | |||||
return &gotoblas_POWER8; | |||||
if (__builtin_cpu_is("power9")) | |||||
return &gotoblas_POWER9; | |||||
return NULL; | |||||
} | |||||
static gotoblas_t *force_coretype(char * coretype) { | |||||
int i ; | |||||
int found = -1; | |||||
char message[128]; | |||||
for ( i = 0 ; i < NUM_CORETYPES; i++) | |||||
{ | |||||
if (!strncasecmp(coretype, corename[i], 20)) | |||||
{ | |||||
found = i; | |||||
break; | |||||
} | |||||
} | |||||
switch (found) | |||||
{ | |||||
case 1: return (&gotoblas_POWER6); | |||||
case 2: return (&gotoblas_POWER8); | |||||
case 3: return (&gotoblas_POWER9); | |||||
default: return NULL; | |||||
} | |||||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||||
openblas_warning(1, message); | |||||
} | |||||
void gotoblas_dynamic_init(void) { | |||||
char coremsg[128]; | |||||
char coren[22]; | |||||
char *p; | |||||
if (gotoblas) return; | |||||
p = getenv("OPENBLAS_CORETYPE"); | |||||
if ( p ) | |||||
{ | |||||
gotoblas = force_coretype(p); | |||||
} | |||||
else | |||||
{ | |||||
gotoblas = get_coretype(); | |||||
} | |||||
if (gotoblas == NULL) | |||||
{ | |||||
snprintf(coremsg, 128, "Falling back to POWER8 core\n"); | |||||
openblas_warning(1, coremsg); | |||||
gotoblas = &gotoblas_POWER8; | |||||
} | |||||
if (gotoblas && gotoblas -> init) { | |||||
strncpy(coren,gotoblas_corename(),20); | |||||
sprintf(coremsg, "Core: %s\n",coren); | |||||
openblas_warning(2, coremsg); | |||||
gotoblas -> init(); | |||||
} else { | |||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||||
exit(1); | |||||
} | |||||
} | |||||
/* Forget the selected kernel table. gotoblas_dynamic_init() returns early
   only while gotoblas is non-NULL, so after this call the next init will
   perform core selection again. */
void gotoblas_dynamic_quit(void) { | |||||
gotoblas = NULL; | |||||
} | |||||
@@ -198,45 +198,68 @@ int get_num_procs(void); | |||||
#else | #else | ||||
int get_num_procs(void) { | int get_num_procs(void) { | ||||
static int nums = 0; | static int nums = 0; | ||||
cpu_set_t *cpusetp; | |||||
size_t size; | |||||
int ret; | |||||
int i,n; | |||||
cpu_set_t cpuset,*cpusetp; | |||||
size_t size; | |||||
int ret; | |||||
#if defined(__GLIBC_PREREQ) | |||||
#if !__GLIBC_PREREQ(2, 7) | |||||
int i; | |||||
#if !__GLIBC_PREREQ(2, 6) | |||||
int n; | |||||
#endif | |||||
#endif | |||||
#endif | |||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | ||||
#if !defined(OS_LINUX) | #if !defined(OS_LINUX) | ||||
return nums; | |||||
return nums; | |||||
#endif | #endif | ||||
#if !defined(__GLIBC_PREREQ) | #if !defined(__GLIBC_PREREQ) | ||||
return nums; | |||||
return nums; | |||||
#else | #else | ||||
#if !__GLIBC_PREREQ(2, 3) | #if !__GLIBC_PREREQ(2, 3) | ||||
return nums; | |||||
return nums; | |||||
#endif | #endif | ||||
#if !__GLIBC_PREREQ(2, 7) | #if !__GLIBC_PREREQ(2, 7) | ||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||||
if (ret!=0) return nums; | if (ret!=0) return nums; | ||||
n=0; | n=0; | ||||
#if !__GLIBC_PREREQ(2, 6) | #if !__GLIBC_PREREQ(2, 6) | ||||
for (i=0;i<nums;i++) | for (i=0;i<nums;i++) | ||||
if (CPU_ISSET(i,cpusetp)) n++; | |||||
if (CPU_ISSET(i,cpuset)) n++; | |||||
nums=n; | nums=n; | ||||
#else | #else | ||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||||
#endif | #endif | ||||
return nums; | return nums; | ||||
#else | #else | ||||
cpusetp = CPU_ALLOC(nums); | |||||
if (cpusetp == NULL) return nums; | |||||
size = CPU_ALLOC_SIZE(nums); | |||||
ret = sched_getaffinity(0,size,cpusetp); | |||||
if (ret!=0) return nums; | |||||
ret = CPU_COUNT_S(size,cpusetp); | |||||
if (ret > 0 && ret < nums) nums = ret; | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
if (nums >= CPU_SETSIZE) { | |||||
cpusetp = CPU_ALLOC(nums); | |||||
if (cpusetp == NULL) { | |||||
return nums; | |||||
} | |||||
size = CPU_ALLOC_SIZE(nums); | |||||
ret = sched_getaffinity(0,size,cpusetp); | |||||
if (ret!=0) { | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
} | |||||
ret = CPU_COUNT_S(size,cpusetp); | |||||
if (ret > 0 && ret < nums) nums = ret; | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
} else { | |||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||||
if (ret!=0) { | |||||
return nums; | |||||
} | |||||
ret = CPU_COUNT(&cpuset); | |||||
if (ret > 0 && ret < nums) nums = ret; | |||||
return nums; | |||||
} | |||||
#endif | #endif | ||||
#endif | #endif | ||||
} | } | ||||
@@ -1073,11 +1096,6 @@ static volatile int memory_initialized = 0; | |||||
} | } | ||||
free(table); | free(table); | ||||
} | } | ||||
#if defined(OS_WINDOWS) | |||||
TlsFree(local_storage_key); | |||||
#else | |||||
pthread_key_delete(local_storage_key); | |||||
#endif | |||||
} | } | ||||
static void blas_memory_init(){ | static void blas_memory_init(){ | ||||
@@ -1295,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { | |||||
free(map_address); | free(map_address); | ||||
} | } | ||||
#ifdef SMP | |||||
void blas_thread_memory_cleanup(void) { | |||||
blas_memory_cleanup((void*)get_memory_table()); | |||||
} | |||||
#endif | |||||
void blas_shutdown(void){ | void blas_shutdown(void){ | ||||
#ifdef SMP | #ifdef SMP | ||||
BLASFUNC(blas_thread_shutdown)(); | BLASFUNC(blas_thread_shutdown)(); | ||||
@@ -1304,7 +1329,7 @@ void blas_shutdown(void){ | |||||
/* Only cleanupIf we were built for threading and TLS was initialized */ | /* Only cleanupIf we were built for threading and TLS was initialized */ | ||||
if (local_storage_key) | if (local_storage_key) | ||||
#endif | #endif | ||||
blas_memory_cleanup((void*)get_memory_table()); | |||||
blas_thread_memory_cleanup(); | |||||
#ifdef SEEK_ADDRESS | #ifdef SEEK_ADDRESS | ||||
base_address = 0UL; | base_address = 0UL; | ||||
@@ -1491,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) { | |||||
blas_shutdown(); | blas_shutdown(); | ||||
#if defined(SMP) | |||||
#if defined(OS_WINDOWS) | |||||
TlsFree(local_storage_key); | |||||
#else | |||||
pthread_key_delete(local_storage_key); | |||||
#endif | |||||
#endif | |||||
#ifdef PROFILE | #ifdef PROFILE | ||||
moncontrol (0); | moncontrol (0); | ||||
#endif | #endif | ||||
@@ -1526,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser | |||||
break; | break; | ||||
case DLL_THREAD_DETACH: | case DLL_THREAD_DETACH: | ||||
#if defined(SMP) | #if defined(SMP) | ||||
blas_memory_cleanup((void*)get_memory_table()); | |||||
blas_thread_memory_cleanup(); | |||||
#endif | #endif | ||||
break; | break; | ||||
case DLL_PROCESS_DETACH: | case DLL_PROCESS_DETACH: | ||||
@@ -1600,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) { | |||||
#endif | #endif | ||||
#else | #else | ||||
/* USE_TLS / COMPILE_TLS not set */ | |||||
#include <errno.h> | #include <errno.h> | ||||
#ifdef OS_WINDOWS | |||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||||
#define ALLOC_WINDOWS | #define ALLOC_WINDOWS | ||||
#ifndef MEM_LARGE_PAGES | #ifndef MEM_LARGE_PAGES | ||||
#define MEM_LARGE_PAGES 0x20000000 | #define MEM_LARGE_PAGES 0x20000000 | ||||
@@ -1616,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
#include <stdio.h> | #include <stdio.h> | ||||
#include <fcntl.h> | #include <fcntl.h> | ||||
#ifndef OS_WINDOWS | |||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#ifndef NO_SYSV_IPC | #ifndef NO_SYSV_IPC | ||||
#include <sys/shm.h> | #include <sys/shm.h> | ||||
@@ -1636,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
#include <sys/resource.h> | #include <sys/resource.h> | ||||
#endif | #endif | ||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/resource.h> | #include <sys/resource.h> | ||||
#endif | #endif | ||||
@@ -1675,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) { | |||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | ||||
#define CONSTRUCTOR __attribute__ ((constructor)) | #define CONSTRUCTOR __attribute__ ((constructor)) | ||||
#define DESTRUCTOR __attribute__ ((destructor)) | #define DESTRUCTOR __attribute__ ((destructor)) | ||||
#else | |||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) | |||||
#define CONSTRUCTOR __attribute__ ((constructor(101))) | #define CONSTRUCTOR __attribute__ ((constructor(101))) | ||||
#define DESTRUCTOR __attribute__ ((destructor(101))) | #define DESTRUCTOR __attribute__ ((destructor(101))) | ||||
#else | |||||
#define CONSTRUCTOR __attribute__ ((constructor)) | |||||
#define DESTRUCTOR __attribute__ ((destructor)) | |||||
#endif | #endif | ||||
#ifdef DYNAMIC_ARCH | #ifdef DYNAMIC_ARCH | ||||
@@ -1701,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {}; | |||||
int get_num_procs(void); | int get_num_procs(void); | ||||
#else | #else | ||||
int get_num_procs(void) { | int get_num_procs(void) { | ||||
static int nums = 0; | static int nums = 0; | ||||
cpu_set_t *cpusetp; | |||||
size_t size; | |||||
int ret; | |||||
int i,n; | |||||
cpu_set_t cpuset,*cpusetp; | |||||
size_t size; | |||||
int ret; | |||||
#if defined(__GLIBC_PREREQ) | |||||
#if !__GLIBC_PREREQ(2, 7) | |||||
int i; | |||||
#if !__GLIBC_PREREQ(2, 6) | |||||
int n; | |||||
#endif | |||||
#endif | |||||
#endif | |||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | ||||
#if !defined(OS_LINUX) | #if !defined(OS_LINUX) | ||||
return nums; | |||||
return nums; | |||||
#endif | #endif | ||||
#if !defined(__GLIBC_PREREQ) | #if !defined(__GLIBC_PREREQ) | ||||
return nums; | |||||
return nums; | |||||
#else | #else | ||||
#if !__GLIBC_PREREQ(2, 3) | #if !__GLIBC_PREREQ(2, 3) | ||||
return nums; | |||||
return nums; | |||||
#endif | #endif | ||||
#if !__GLIBC_PREREQ(2, 7) | #if !__GLIBC_PREREQ(2, 7) | ||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||||
if (ret!=0) return nums; | if (ret!=0) return nums; | ||||
n=0; | n=0; | ||||
#if !__GLIBC_PREREQ(2, 6) | #if !__GLIBC_PREREQ(2, 6) | ||||
for (i=0;i<nums;i++) | for (i=0;i<nums;i++) | ||||
if (CPU_ISSET(i,cpusetp)) n++; | |||||
if (CPU_ISSET(i,cpuset)) n++; | |||||
nums=n; | nums=n; | ||||
#else | #else | ||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||||
#endif | #endif | ||||
return nums; | return nums; | ||||
#else | #else | ||||
cpusetp = CPU_ALLOC(nums); | |||||
if (cpusetp == NULL) return nums; | |||||
size = CPU_ALLOC_SIZE(nums); | |||||
ret = sched_getaffinity(0,size,cpusetp); | |||||
if (ret!=0) return nums; | |||||
nums = CPU_COUNT_S(size,cpusetp); | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
if (nums >= CPU_SETSIZE) { | |||||
cpusetp = CPU_ALLOC(nums); | |||||
if (cpusetp == NULL) { | |||||
return nums; | |||||
} | |||||
size = CPU_ALLOC_SIZE(nums); | |||||
ret = sched_getaffinity(0,size,cpusetp); | |||||
if (ret!=0) { | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
} | |||||
ret = CPU_COUNT_S(size,cpusetp); | |||||
if (ret > 0 && ret < nums) nums = ret; | |||||
CPU_FREE(cpusetp); | |||||
return nums; | |||||
} else { | |||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||||
if (ret!=0) { | |||||
return nums; | |||||
} | |||||
ret = CPU_COUNT(&cpuset); | |||||
if (ret > 0 && ret < nums) nums = ret; | |||||
return nums; | |||||
} | |||||
#endif | #endif | ||||
#endif | #endif | ||||
} | } | ||||
@@ -1753,7 +1816,7 @@ int get_num_procs(void) { | |||||
return nums; | return nums; | ||||
} | } | ||||
#endif | #endif | ||||
#ifdef OS_HAIKU | #ifdef OS_HAIKU | ||||
int get_num_procs(void) { | int get_num_procs(void) { | ||||
static int nums = 0; | static int nums = 0; | ||||
@@ -1790,7 +1853,7 @@ int get_num_procs(void) { | |||||
#endif | #endif | ||||
#if defined(OS_FREEBSD) | |||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||||
int get_num_procs(void) { | int get_num_procs(void) { | ||||
@@ -1867,7 +1930,7 @@ void openblas_fork_handler() | |||||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | ||||
// In the mean time build with USE_OPENMP=0 or link against another | // In the mean time build with USE_OPENMP=0 or link against another | ||||
// implementation of OpenMP. | // implementation of OpenMP. | ||||
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||||
int err; | int err; | ||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | ||||
if(err != 0) | if(err != 0) | ||||
@@ -1880,7 +1943,7 @@ extern int openblas_goto_num_threads_env(); | |||||
extern int openblas_omp_num_threads_env(); | extern int openblas_omp_num_threads_env(); | ||||
int blas_get_cpu_number(void){ | int blas_get_cpu_number(void){ | ||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
int max_num; | int max_num; | ||||
#endif | #endif | ||||
int blas_goto_num = 0; | int blas_goto_num = 0; | ||||
@@ -1888,11 +1951,11 @@ int blas_get_cpu_number(void){ | |||||
if (blas_num_threads) return blas_num_threads; | if (blas_num_threads) return blas_num_threads; | ||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
max_num = get_num_procs(); | max_num = get_num_procs(); | ||||
#endif | #endif | ||||
blas_goto_num = 0; | |||||
// blas_goto_num = 0; | |||||
#ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
blas_goto_num=openblas_num_threads_env(); | blas_goto_num=openblas_num_threads_env(); | ||||
if (blas_goto_num < 0) blas_goto_num = 0; | if (blas_goto_num < 0) blas_goto_num = 0; | ||||
@@ -1904,7 +1967,7 @@ int blas_get_cpu_number(void){ | |||||
#endif | #endif | ||||
blas_omp_num = 0; | |||||
// blas_omp_num = 0; | |||||
blas_omp_num=openblas_omp_num_threads_env(); | blas_omp_num=openblas_omp_num_threads_env(); | ||||
if (blas_omp_num < 0) blas_omp_num = 0; | if (blas_omp_num < 0) blas_omp_num = 0; | ||||
@@ -1912,7 +1975,7 @@ int blas_get_cpu_number(void){ | |||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | ||||
else blas_num_threads = MAX_CPU_NUMBER; | else blas_num_threads = MAX_CPU_NUMBER; | ||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
if (blas_num_threads > max_num) blas_num_threads = max_num; | if (blas_num_threads > max_num) blas_num_threads = max_num; | ||||
#endif | #endif | ||||
@@ -1999,11 +2062,15 @@ static void *alloc_mmap(void *address){ | |||||
} | } | ||||
if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
release_pos ++; | release_pos ++; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
} | } | ||||
#ifdef OS_LINUX | #ifdef OS_LINUX | ||||
@@ -2145,14 +2212,18 @@ static void *alloc_mmap(void *address){ | |||||
#if defined(OS_LINUX) && !defined(NO_WARMUP) | #if defined(OS_LINUX) && !defined(NO_WARMUP) | ||||
} | } | ||||
#endif | #endif | ||||
LOCK_COMMAND(&alloc_lock); | |||||
if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | |||||
#endif | |||||
release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
release_pos ++; | release_pos ++; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | |||||
#endif | |||||
} | } | ||||
UNLOCK_COMMAND(&alloc_lock); | |||||
return map_address; | return map_address; | ||||
} | } | ||||
@@ -2520,7 +2591,7 @@ void *blas_memory_alloc(int procpos){ | |||||
int position; | int position; | ||||
#if defined(WHEREAMI) && !defined(USE_OPENMP) | #if defined(WHEREAMI) && !defined(USE_OPENMP) | ||||
int mypos; | |||||
int mypos = 0; | |||||
#endif | #endif | ||||
void *map_address; | void *map_address; | ||||
@@ -2551,6 +2622,11 @@ void *blas_memory_alloc(int procpos){ | |||||
NULL, | NULL, | ||||
}; | }; | ||||
void *(**func)(void *address); | void *(**func)(void *address); | ||||
#if defined(USE_OPENMP) | |||||
if (!memory_initialized) { | |||||
#endif | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
if (!memory_initialized) { | if (!memory_initialized) { | ||||
@@ -2586,6 +2662,9 @@ void *blas_memory_alloc(int procpos){ | |||||
} | } | ||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
#if defined(USE_OPENMP) | |||||
} | |||||
#endif | |||||
#ifdef DEBUG | #ifdef DEBUG | ||||
printf("Alloc Start ...\n"); | printf("Alloc Start ...\n"); | ||||
@@ -2600,13 +2679,17 @@ void *blas_memory_alloc(int procpos){ | |||||
do { | do { | ||||
if (!memory[position].used && (memory[position].pos == mypos)) { | if (!memory[position].used && (memory[position].pos == mypos)) { | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
// blas_lock(&memory[position].lock); | |||||
#else | |||||
blas_lock(&memory[position].lock); | |||||
#endif | |||||
if (!memory[position].used) goto allocation; | if (!memory[position].used) goto allocation; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
// blas_unlock(&memory[position].lock); | |||||
#else | |||||
blas_unlock(&memory[position].lock); | |||||
#endif | |||||
} | } | ||||
position ++; | position ++; | ||||
@@ -2618,21 +2701,26 @@ void *blas_memory_alloc(int procpos){ | |||||
position = 0; | position = 0; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
do { | do { | ||||
/* if (!memory[position].used) { */ | |||||
/* blas_lock(&memory[position].lock);*/ | |||||
#if defined(USE_OPENMP) | |||||
if (!memory[position].used) { | |||||
blas_lock(&memory[position].lock); | |||||
#endif | |||||
if (!memory[position].used) goto allocation; | if (!memory[position].used) goto allocation; | ||||
/* blas_unlock(&memory[position].lock);*/ | |||||
/* } */ | |||||
#if defined(USE_OPENMP) | |||||
blas_unlock(&memory[position].lock); | |||||
} | |||||
#endif | |||||
position ++; | position ++; | ||||
} while (position < NUM_BUFFERS); | } while (position < NUM_BUFFERS); | ||||
UNLOCK_COMMAND(&alloc_lock); | |||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | |||||
#endif | |||||
goto error; | goto error; | ||||
allocation : | allocation : | ||||
@@ -2642,10 +2730,11 @@ void *blas_memory_alloc(int procpos){ | |||||
#endif | #endif | ||||
memory[position].used = 1; | memory[position].used = 1; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
/* blas_unlock(&memory[position].lock);*/ | |||||
#else | |||||
blas_unlock(&memory[position].lock); | |||||
#endif | |||||
if (!memory[position].addr) { | if (!memory[position].addr) { | ||||
do { | do { | ||||
#ifdef DEBUG | #ifdef DEBUG | ||||
@@ -2690,9 +2779,13 @@ void *blas_memory_alloc(int procpos){ | |||||
} while ((BLASLONG)map_address == -1); | } while ((BLASLONG)map_address == -1); | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
memory[position].addr = map_address; | memory[position].addr = map_address; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
#ifdef DEBUG | #ifdef DEBUG | ||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | ||||
@@ -2746,8 +2839,9 @@ void blas_memory_free(void *free_area){ | |||||
#endif | #endif | ||||
position = 0; | position = 0; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | ||||
position++; | position++; | ||||
@@ -2761,7 +2855,9 @@ void blas_memory_free(void *free_area){ | |||||
WMB; | WMB; | ||||
memory[position].used = 0; | memory[position].used = 0; | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
#ifdef DEBUG | #ifdef DEBUG | ||||
printf("Unmap Succeeded.\n\n"); | printf("Unmap Succeeded.\n\n"); | ||||
@@ -2776,8 +2872,9 @@ void blas_memory_free(void *free_area){ | |||||
for (position = 0; position < NUM_BUFFERS; position++) | for (position = 0; position < NUM_BUFFERS; position++) | ||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | ||||
#endif | #endif | ||||
#if defined(SMP) && !defined(USE_OPENMP) | |||||
UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
#endif | |||||
return; | return; | ||||
} | } | ||||
@@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include <string.h> | #include <string.h> | ||||
#if defined(_WIN32) && defined(_MSC_VER) | |||||
#if _MSC_VER < 1900 | |||||
#define snprintf _snprintf | |||||
#endif | |||||
#endif | |||||
static char* openblas_config_str="" | static char* openblas_config_str="" | ||||
"OpenBLAS " | "OpenBLAS " | ||||
VERSION | VERSION | ||||
@@ -141,6 +141,14 @@ else | |||||
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed | $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed | ||||
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c | ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c | ||||
endif | endif | ||||
ifeq ($(F_COMPILER), INTEL) | |||||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
else | |||||
ifneq ($(C_COMPILER), LSB) | ifneq ($(C_COMPILER), LSB) | ||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
@@ -152,6 +160,7 @@ else | |||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | ||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
endif | |||||
endif | endif | ||||
rm -f linktest | rm -f linktest | ||||
@@ -40,15 +40,25 @@ | |||||
void gotoblas_init(void); | void gotoblas_init(void); | ||||
void gotoblas_quit(void); | void gotoblas_quit(void); | ||||
#if defined(SMP) && defined(USE_TLS) | |||||
void blas_thread_memory_cleanup(void); | |||||
#endif | |||||
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { | BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { | ||||
if (reason == DLL_PROCESS_ATTACH) { | |||||
gotoblas_init(); | |||||
} | |||||
if (reason == DLL_PROCESS_DETACH) { | |||||
gotoblas_quit(); | |||||
switch(reason) { | |||||
case DLL_PROCESS_ATTACH: | |||||
gotoblas_init(); | |||||
break; | |||||
case DLL_PROCESS_DETACH: | |||||
gotoblas_quit(); | |||||
break; | |||||
case DLL_THREAD_ATTACH: | |||||
break; | |||||
case DLL_THREAD_DETACH: | |||||
#if defined(SMP) && defined(USE_TLS) | |||||
blas_thread_memory_cleanup(); | |||||
#endif | |||||
break; | |||||
} | } | ||||
return TRUE; | return TRUE; | ||||
@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include <unistd.h> | #include <unistd.h> | ||||
#endif | #endif | ||||
/* Compiler gate for AVX512 support.
 * NO_AVX512 is left undefined only when the compiler is GCC with major
 * version > 6 and __AVX2__ defined, or clang with major version >= 6.
 * For every other compiler/version combination the (intentionally empty)
 * #if branch is skipped and NO_AVX512 is defined.
 * The FORCE_SKYLAKEX section of this file tests NO_AVX512 and, when it is
 * set, emits the HASWELL configuration instead. */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
#else | |||||
#define NO_AVX512 | |||||
#endif | |||||
/* #define FORCE_P2 */ | /* #define FORCE_P2 */ | ||||
/* #define FORCE_KATMAI */ | /* #define FORCE_KATMAI */ | ||||
/* #define FORCE_COPPERMINE */ | /* #define FORCE_COPPERMINE */ | ||||
@@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#ifdef FORCE_SKYLAKEX | #ifdef FORCE_SKYLAKEX | ||||
#ifdef NO_AVX512 | |||||
#define FORCE | |||||
#define FORCE_INTEL | |||||
#define ARCHITECTURE "X86" | |||||
#define SUBARCHITECTURE "HASWELL" | |||||
#define ARCHCONFIG "-DHASWELL " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||||
"-DFMA3" | |||||
#define LIBNAME "haswell" | |||||
#define CORENAME "HASWELL" | |||||
#else | |||||
#define FORCE | #define FORCE | ||||
#define FORCE_INTEL | #define FORCE_INTEL | ||||
#define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
@@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define LIBNAME "skylakex" | #define LIBNAME "skylakex" | ||||
#define CORENAME "SKYLAKEX" | #define CORENAME "SKYLAKEX" | ||||
#endif | #endif | ||||
#endif | |||||
#ifdef FORCE_ATOM | #ifdef FORCE_ATOM | ||||
#define FORCE | #define FORCE | ||||
@@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CORENAME "POWER8" | #define CORENAME "POWER8" | ||||
#endif | #endif | ||||
/* Target description selected when FORCE_POWER9 is defined.
 * Defines FORCE (so the "#ifndef FORCE" fallback further down is skipped)
 * and sets the identification strings: architecture "POWER",
 * sub-architecture and core name "POWER9", sub-directory "power",
 * library name "power9".
 * ARCHCONFIG is a single string literal built from adjacent pieces; it
 * carries -DPOWER9 plus the cache/TLB parameters as -D flags
 * (L1_DATA_SIZE=32768, L1_DATA_LINESIZE=128, L2_SIZE=4194304,
 * L2_LINESIZE=128, L2_ASSOCIATIVE=8, DTB_DEFAULT_ENTRIES=128,
 * DTB_SIZE=4096).
 * NOTE(review): how these macros are consumed is outside this view. */
#if defined(FORCE_POWER9) | |||||
#define FORCE | |||||
#define ARCHITECTURE "POWER" | |||||
#define SUBARCHITECTURE "POWER9" | |||||
#define SUBDIRNAME "power" | |||||
#define ARCHCONFIG "-DPOWER9 " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
#define LIBNAME "power9" | |||||
#define CORENAME "POWER9" | |||||
#endif | |||||
#ifdef FORCE_PPCG4 | #ifdef FORCE_PPCG4 | ||||
#define FORCE | #define FORCE | ||||
@@ -1046,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#else | #else | ||||
#endif | #endif | ||||
/* Target description selected when FORCE_TSV110 is defined.
 * Defines FORCE and the identification strings: architecture "ARM64",
 * sub-architecture and core name "TSV110", sub-directory "arm64",
 * library name "tsv110".
 * ARCHCONFIG carries -DTSV110, the L1 code/data and L2 cache parameters
 * (size, line size, associativity), the DTB parameters, and the feature
 * flags -DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8.
 * The trailing #else branch is empty. */
#ifdef FORCE_TSV110 | |||||
#define FORCE | |||||
#define ARCHITECTURE "ARM64" | |||||
#define SUBARCHITECTURE "TSV110" | |||||
#define SUBDIRNAME "arm64" | |||||
#define ARCHCONFIG "-DTSV110 " \ | |||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||||
#define LIBNAME "tsv110" | |||||
#define CORENAME "TSV110" | |||||
#else | |||||
#endif | |||||
#ifdef FORCE_ZARCH_GENERIC | #ifdef FORCE_ZARCH_GENERIC | ||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "ZARCH" | #define ARCHITECTURE "ZARCH" | ||||
@@ -1066,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define CORENAME "Z13" | #define CORENAME "Z13" | ||||
#endif | #endif | ||||
/* Target description selected when FORCE_Z14 is defined.
 * Defines FORCE and the identification strings: architecture "ZARCH",
 * sub-architecture and core name "Z14", library name "z14".
 * ARCHCONFIG only carries -DZ14 and -DDTB_DEFAULT_ENTRIES=64; no cache
 * parameters and no SUBDIRNAME are defined in this block. */
#ifdef FORCE_Z14 | |||||
#define FORCE | |||||
#define ARCHITECTURE "ZARCH" | |||||
#define SUBARCHITECTURE "Z14" | |||||
#define ARCHCONFIG "-DZ14 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64" | |||||
#define LIBNAME "z14" | |||||
#define CORENAME "Z14" | |||||
#endif | |||||
#ifndef FORCE | #ifndef FORCE | ||||
/* This check sits inside "#ifndef FORCE", i.e. it is reached only when
 * none of the FORCE_* sections above matched. If USER_TARGET is defined
 * in that situation the build is stopped with the #error below instead
 * of continuing into the code that follows.
 * NOTE(review): USER_TARGET is presumably set by the build system when
 * the user passes TARGET=... - confirm where it is defined. */
#ifdef USER_TARGET | |||||
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" | |||||
#endif | |||||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | ||||
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | ||||
#ifndef POWER | #ifndef POWER | ||||
@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES | |||||
rotm.c rotmg.c # N.B. these do not have complex counterparts | rotm.c rotmg.c # N.B. these do not have complex counterparts | ||||
rot.c | rot.c | ||||
asum.c | asum.c | ||||
sum.c | |||||
) | ) | ||||
# these will have 'z' prepended for the complex version | # these will have 'z' prepended for the complex version | ||||
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||||
endif () | endif () | ||||
if (${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "ZCOMPLEX") | ||||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | ||||
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||||
endif () | endif () | ||||
endforeach () | endforeach () | ||||
@@ -25,7 +25,7 @@ SBLAS1OBJS = \ | |||||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | ||||
scopy.$(SUFFIX) sscal.$(SUFFIX) \ | scopy.$(SUFFIX) sscal.$(SUFFIX) \ | ||||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | ||||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||||
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | ||||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | ||||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | ||||
@@ -51,7 +51,7 @@ DBLAS1OBJS = \ | |||||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | ||||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | ||||
ddot.$(SUFFIX) \ | ddot.$(SUFFIX) \ | ||||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||||
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | ||||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | ||||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | ||||
@@ -76,7 +76,7 @@ CBLAS1OBJS = \ | |||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | ||||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | ||||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | ||||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||||
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||||
scamax.$(SUFFIX) icamax.$(SUFFIX) \ | scamax.$(SUFFIX) icamax.$(SUFFIX) \ | ||||
scamin.$(SUFFIX) icamin.$(SUFFIX) \ | scamin.$(SUFFIX) icamin.$(SUFFIX) \ | ||||
csrot.$(SUFFIX) crotg.$(SUFFIX) \ | csrot.$(SUFFIX) crotg.$(SUFFIX) \ | ||||
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \ | |||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | ||||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | ||||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | ||||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||||
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | ||||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | ||||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | ||||
@@ -146,7 +146,7 @@ QBLAS1OBJS = \ | |||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | ||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | ||||
qdot.$(SUFFIX) \ | qdot.$(SUFFIX) \ | ||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | ||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | ||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | ||||
@@ -168,7 +168,7 @@ XBLAS1OBJS = \ | |||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | ||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | ||||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | ||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | ||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | ||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | ||||
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION | |||||
QBLAS1OBJS = \ | QBLAS1OBJS = \ | ||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | ||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | ||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | ||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | ||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | ||||
@@ -224,7 +224,7 @@ QBLAS3OBJS = \ | |||||
XBLAS1OBJS = \ | XBLAS1OBJS = \ | ||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | ||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | ||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | ||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | ||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | ||||
@@ -263,7 +263,8 @@ CSBLAS1OBJS = \ | |||||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | ||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | ||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | ||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||||
CSBLAS2OBJS = \ | CSBLAS2OBJS = \ | ||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | ||||
@@ -280,7 +281,8 @@ CDBLAS1OBJS = \ | |||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | ||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | ||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | ||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||||
CDBLAS2OBJS = \ | CDBLAS2OBJS = \ | ||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | ||||
@@ -300,7 +302,8 @@ CCBLAS1OBJS = \ | |||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | ||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
cblas_caxpby.$(SUFFIX) | |||||
cblas_caxpby.$(SUFFIX) \ | |||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||||
CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | ||||
@@ -326,7 +329,9 @@ CZBLAS1OBJS = \ | |||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | ||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
cblas_zaxpby.$(SUFFIX) | |||||
cblas_zaxpby.$(SUFFIX) \ | |||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||||
CZBLAS2OBJS = \ | CZBLAS2OBJS = \ | ||||
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ | cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ | ||||
@@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c | |||||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | ||||
$(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
# Rules for the six sum objects (ssum, dsum, qsum, scsum, dzsum, qxsum).
# Each rule builds both the $(SUFFIX) and the $(PSUFFIX) object from the
# single source sum.c, using the same command: compile the first
# prerequisite ($<) into the file part of the target name ($(@F)).
# Nothing in these rules distinguishes one variant from another; any such
# difference must come from $(CFLAGS) or from elsewhere in the build
# (not visible here).
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | ||||
$(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
@@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c | |||||
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c | cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | ||||
# CBLAS index-of-max / index-of-min objects for the c and z prefixes.
# All four are compiled from imax.c with -DCBLAS and USE_ABS explicitly
# undefined (-UUSE_ABS). The max variants also undefine USE_MIN
# (-UUSE_MIN); the min variants define it (-DUSE_MIN).
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
@@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c | |||||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
# CBLAS sum objects (cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum).
# Same recipe as the non-CBLAS sum objects, but compiled with -DCBLAS,
# which makes sum.c emit its CNAME entry point (the "#else" side of its
# "#ifndef CBLAS") instead of the NAME one.
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
@@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | ||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
@@ -0,0 +1,97 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#include <stdio.h> | |||||
#include "common.h" | |||||
#ifdef FUNCTION_PROFILE | |||||
#include "functable.h" | |||||
#endif | |||||
#ifndef CBLAS | |||||
/* Entry point compiled when CBLAS is not defined.
 * All arguments are passed by pointer; *N and *INCX are copied into
 * BLASLONG locals before use.
 *   N    - pointer to the element count
 *   x    - pointer to the vector data
 *   INCX - pointer to the stride between elements
 * Returns 0 immediately when *N <= 0 (before IDEBUG_START and
 * FUNCTION_PROFILE_START are reached). Otherwise returns the result of
 * SUM_K(n, x, incx) cast to FLOATRET.
 * NOTE(review): SUM_K is defined outside this file; presumably it returns
 * the plain sum of the vector elements - confirm against the kernel. */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
BLASLONG n = *N; | |||||
BLASLONG incx = *INCX; | |||||
FLOATRET ret; | |||||
PRINT_DEBUG_NAME; | |||||
if (n <= 0) return 0; | |||||
IDEBUG_START; | |||||
FUNCTION_PROFILE_START(); | |||||
ret = (FLOATRET)SUM_K(n, x, incx); | |||||
FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||||
IDEBUG_END; | |||||
return ret; | |||||
} | |||||
#else | |||||
#ifdef COMPLEX | |||||
FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||||
FLOAT *x = (FLOAT*) vx; | |||||
#else | |||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||||
#endif | |||||
FLOAT ret; | |||||
PRINT_DEBUG_CNAME; | |||||
if (n <= 0) return 0; | |||||
IDEBUG_START; | |||||
FUNCTION_PROFILE_START(); | |||||
ret = SUM_K(n, x, incx); | |||||
FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||||
IDEBUG_END; | |||||
return ret; | |||||
} | |||||
#endif |
@@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
#ifdef SMP | #ifdef SMP | ||||
/* nthreads = num_cpu_avail(2); | |||||
nthreads = num_cpu_avail(2); | |||||
FIXME trmv_thread was found to be broken, see issue 1332 */ | |||||
nthreads = 1; | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -81,6 +81,12 @@ | |||||
#endif | #endif | ||||
#endif | #endif | ||||
#ifndef COMPLEX | |||||
#define SMP_FACTOR 256 | |||||
#else | |||||
#define SMP_FACTOR 128 | |||||
#endif | |||||
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | ||||
#ifndef TRMM | #ifndef TRMM | ||||
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, | TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, | ||||
@@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, | |||||
if (side < 0) info = 1; | if (side < 0) info = 1; | ||||
if (info != 0) { | if (info != 0) { | ||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); | |||||
return; | return; | ||||
} | } | ||||
@@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, | |||||
mode |= (trans << BLAS_TRANSA_SHIFT); | mode |= (trans << BLAS_TRANSA_SHIFT); | ||||
mode |= (side << BLAS_RSIDE_SHIFT); | mode |= (side << BLAS_RSIDE_SHIFT); | ||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||||
/* | |||||
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||||
args.nthreads = 1; | args.nthreads = 1; | ||||
else | else | ||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||||
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||||
args.nthreads = 1; | args.nthreads = 1; | ||||
*/ | |||||
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) | |||||
args.nthreads = 1; | |||||
else | else | ||||
args.nthreads = num_cpu_avail(3); | args.nthreads = num_cpu_avail(3); | ||||
@@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
} else | } else | ||||
nthreads = 1; | nthreads = 1; | ||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||||
nthreads = 1; | |||||
if(nthreads > 1) { | if(nthreads > 1) { | ||||
buffer_size = n > 16 ? 0 : n * 4 + 40; | buffer_size = n > 16 ? 0 : n * 4 + 40; | ||||
} | } | ||||
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | ||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | ||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | ||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) | |||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | ||||
@@ -340,6 +340,32 @@ ifndef XSCALKERNEL | |||||
XSCALKERNEL = zscal.S | XSCALKERNEL = zscal.S | ||||
endif | endif | ||||
### SUM ### | |||||
ifndef SSUMKERNEL | |||||
SSUMKERNEL = sum.S | |||||
endif | |||||
ifndef DSUMKERNEL | |||||
DSUMKERNEL = sum.S | |||||
endif | |||||
ifndef CSUMKERNEL | |||||
CSUMKERNEL = zsum.S | |||||
endif | |||||
ifndef ZSUMKERNEL | |||||
ZSUMKERNEL = zsum.S | |||||
endif | |||||
ifndef QSUMKERNEL | |||||
QSUMKERNEL = sum.S | |||||
endif | |||||
ifndef XSUMKERNEL | |||||
XSUMKERNEL = zsum.S | |||||
endif | |||||
### SWAP ### | ### SWAP ### | ||||
ifndef SSWAPKERNEL | ifndef SSWAPKERNEL | ||||
@@ -453,7 +479,7 @@ endif | |||||
SBLASOBJS += \ | SBLASOBJS += \ | ||||
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | ||||
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | ||||
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
saxpby_k$(TSUFFIX).$(SUFFIX) | saxpby_k$(TSUFFIX).$(SUFFIX) | ||||
@@ -463,31 +489,32 @@ DBLASOBJS += \ | |||||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | ||||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
daxpby_k$(TSUFFIX).$(SUFFIX) | |||||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||||
QBLASOBJS += \ | QBLASOBJS += \ | ||||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) | |||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||||
qsum_k$(TSUFFIX).$(SUFFIX) | |||||
CBLASOBJS += \ | CBLASOBJS += \ | ||||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) | |||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) | |||||
ZBLASOBJS += \ | ZBLASOBJS += \ | ||||
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) | |||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) | |||||
XBLASOBJS += \ | XBLASOBJS += \ | ||||
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) | |||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||||
### AMAX ### | ### AMAX ### | ||||
@@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | ||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | ||||
### ASUM ### | |||||
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | ||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | ||||
@@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | ||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | ||||
### SUM ### | |||||
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||||
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||||
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ | |||||
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ | |||||
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) | |||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||||
### AXPY ### | |||||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | ||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | ||||
@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) | |||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
ifeq ($(TARGET), GENERIC) | |||||
ifeq ($(CORE), GENERIC) | |||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
@@ -44,10 +44,18 @@ ifeq ($(CORE), POWER8) | |||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
ifeq ($(CORE), POWER9) | |||||
USE_TRMM = 1 | |||||
endif | |||||
ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
USE_TRMM = 1 | USE_TRMM = 1 | ||||
endif | endif | ||||
ifeq ($(CORE), Z14) | |||||
USE_TRMM = 1 | |||||
endif | |||||
@@ -0,0 +1,206 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "version.h" | |||||
/* Look-ahead distance used by the prefetch-style load in the main loop. */
#define PREFETCHSIZE 88

/* Integer registers: element count, vector pointer, stride, loop counter. */
#define N	$16
#define X	$17
#define INCX	$18
#define I	$19

/* s0-s3: four independent partial sums; the final total ends up in s0. */
#define s0	$f0
#define s1	$f1
#define s2	$f10
#define s3	$f11

/* a0-a7: values freshly loaded from the vector. */
#define a0	$f12
#define a1	$f13
#define a2	$f14
#define a3	$f15
#define a4	$f16
#define a5	$f17
#define a6	$f18
#define a7	$f19

/* t0-t3: staging registers; a loaded value is moved here with fmov and
   added to a partial sum one step later. */
#define t0	$f20
#define t1	$f21
#define t2	$f22
#define t3	$f23

/*
 * Plain (non-absolute) sum of N elements of X taken with stride INCX.
 * The loop is unrolled eight elements per pass and staged: each ADD
 * consumes the t-register filled by an earlier fmov, so one value is
 * always still pending in t0 when a phase ends and is added at $L999.
 * SXADDQ is a macro defined outside this file; it presumably advances X
 * by INCX elements -- confirm against the Alpha common header.
 */
	PROLOGUE
	PROFCODE

	fclr	s0
	unop
	fclr	t0
	ble	N, $L999		/* N <= 0: fall out with s0 = t0 = 0 */

	sra	N, 3, I			/* I = number of 8-element passes */
	fclr	s1
	fclr	s2
	ble	I, $L15			/* fewer than 8 elements: tail only */

	/* Preload the first six elements while clearing the remaining
	   staging registers and s3. */
	LD	a0, 0 * SIZE(X)
	fclr	t1
	SXADDQ	INCX, X, X
	fclr	t2

	LD	a1, 0 * SIZE(X)
	fclr	t3
	SXADDQ	INCX, X, X
	fclr	s3

	LD	a2, 0 * SIZE(X)
	SXADDQ	INCX, X, X
	LD	a3, 0 * SIZE(X)
	SXADDQ	INCX, X, X

	LD	a4, 0 * SIZE(X)
	SXADDQ	INCX, X, X
	LD	a5, 0 * SIZE(X)
	SXADDQ	INCX, X, X

	lda	I, -1(I)
	ble	I, $L13			/* only one pass: go straight to the drain */
	.align 4

/* Steady state: eight ADDs retire staged values, eight LDs fetch ahead. */
$L12:
	ADD	s0, t0, s0
	ldl	$31, PREFETCHSIZE * 2 * SIZE(X)	/* result discarded in $31 */
	fmov	a0, t0
	lda	I, -1(I)

	ADD	s1, t1, s1
	LD	a6, 0 * SIZE(X)
	fmov	a1, t1
	SXADDQ	INCX, X, X

	ADD	s2, t2, s2
	LD	a7, 0 * SIZE(X)
	fmov	a2, t2
	SXADDQ	INCX, X, X

	ADD	s3, t3, s3
	LD	a0, 0 * SIZE(X)
	fmov	a3, t3
	SXADDQ	INCX, X, X

	ADD	s0, t0, s0
	LD	a1, 0 * SIZE(X)
	fmov	a4, t0
	SXADDQ	INCX, X, X

	ADD	s1, t1, s1
	LD	a2, 0 * SIZE(X)
	fmov	a5, t1
	SXADDQ	INCX, X, X

	ADD	s2, t2, s2
	LD	a3, 0 * SIZE(X)
	fmov	a6, t2
	SXADDQ	INCX, X, X

	ADD	s3, t3, s3
	LD	a4, 0 * SIZE(X)
	fmov	a7, t3
	SXADDQ	INCX, X, X

	LD	a5, 0 * SIZE(X)
	unop
	SXADDQ	INCX, X, X
	bne	I, $L12
	.align 4

/* Drain: load the last two elements of the final pass and retire
   everything that is staged, except the value left pending in t0. */
$L13:
	ADD	s0, t0, s0
	LD	a6, 0 * SIZE(X)
	fmov	a0, t0
	SXADDQ	INCX, X, X

	ADD	s1, t1, s1
	LD	a7, 0 * SIZE(X)
	fmov	a1, t1
	SXADDQ	INCX, X, X

	ADD	s2, t2, s2
	fmov	a2, t2
	ADD	s3, t3, s3
	fmov	a3, t3

	ADD	s0, t0, s0
	fmov	a4, t0			/* stays pending until $L17 / $L999 */
	ADD	s1, t1, s1
	fmov	a5, t1
	ADD	s2, t2, s2
	fmov	a6, t2
	ADD	s3, t3, s3
	fmov	a7, t3

	ADD	s1, t1, s1
	ADD	s2, t2, s2
	ADD	s3, t3, s3

	/* Pairwise reduction: s0 += s1, s2 += s3. */
	ADD	s0, s1, s0
	ADD	s2, s3, s2
	.align 4

/* Tail: fold s2 into s0, then handle the N mod 8 leftover elements. */
$L15:
	and	N, 7, I
	ADD	s0, s2, s0
	unop
	ble	I, $L999
	.align 4

/* One element per pass; the ADD at the top consumes the value staged by
   the previous pass (or by the drain above). */
$L17:
	ADD	s0, t0, s0
	LD	a0, 0 * SIZE(X)
	SXADDQ	INCX, X, X
	fmov	a0, t0

	lda	I, -1(I)
	bne	I, $L17
	.align 4

/* Add the last pending value and return with the total in s0. */
$L999:
	ADD	s0, t0, s0
	ret
	EPILOGUE
@@ -0,0 +1,208 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "version.h" | |||||
/* Look-ahead distance used by the prefetch-style load in the main loop. */
#define PREFETCHSIZE 88

/* Integer registers: element count, vector pointer, stride, loop counter. */
#define N	$16
#define X	$17
#define INCX	$18
#define I	$19

/* s0/s2 accumulate the values loaded from offset 0 of each element,
   s1/s3 those loaded from offset 1; the final total ends up in s0. */
#define s0	$f0
#define s1	$f1
#define s2	$f10
#define s3	$f11

/* a0-a7: values freshly loaded from the vector. */
#define a0	$f12
#define a1	$f13
#define a2	$f14
#define a3	$f15
#define a4	$f16
#define a5	$f17
#define a6	$f18
#define a7	$f19

/* t0-t3: staging registers; a loaded value is moved here with fmov and
   added to a partial sum one step later. */
#define t0	$f20
#define t1	$f21
#define t2	$f22
#define t3	$f23

/*
 * Plain (non-absolute) sum over N two-component elements of X: both
 * components of every element are added into a single scalar result.
 * INCX is doubled up front because each element occupies two FLOAT
 * slots (loads use offsets 0 * SIZE and 1 * SIZE).  The loop handles
 * four elements (eight values) per pass and is staged the same way as a
 * software pipeline: each ADD consumes a t-register filled earlier.
 * SXADDQ is a macro defined outside this file; it presumably advances X
 * by INCX FLOAT slots -- confirm against the Alpha common header.
 */
	PROLOGUE
	PROFCODE

	fclr	s0
	unop
	fclr	t0
	addq	INCX, INCX, INCX	/* stride in FLOAT slots = 2 * INCX */

	fclr	s1
	unop
	fclr	t1
	ble	N, $L999		/* N <= 0: fall out with a zero result */

	fclr	s2
	sra	N, 2, I			/* I = number of 4-element passes */
	fclr	s3
	ble	I, $L15			/* fewer than 4 elements: tail only */

	/* Preload the first three elements while clearing t2 and t3. */
	LD	a0, 0 * SIZE(X)
	fclr	t2
	LD	a1, 1 * SIZE(X)
	SXADDQ	INCX, X, X

	LD	a2, 0 * SIZE(X)
	fclr	t3
	LD	a3, 1 * SIZE(X)
	SXADDQ	INCX, X, X

	LD	a4, 0 * SIZE(X)
	LD	a5, 1 * SIZE(X)
	SXADDQ	INCX, X, X
	lda	I, -1(I)

	ble	I, $L13			/* only one pass: go straight to the drain */
	.align 4

/* Steady state: eight ADDs retire staged values, eight LDs fetch ahead. */
$L12:
	ADD	s0, t0, s0
	ldl	$31, PREFETCHSIZE * SIZE(X)	/* result discarded in $31 */
	fmov	a0, t0
	lda	I, -1(I)

	ADD	s1, t1, s1
	LD	a6, 0 * SIZE(X)
	fmov	a1, t1
	unop

	ADD	s2, t2, s2
	LD	a7, 1 * SIZE(X)
	fmov	a2, t2
	SXADDQ	INCX, X, X

	ADD	s3, t3, s3
	LD	a0, 0 * SIZE(X)
	fmov	a3, t3
	unop

	ADD	s0, t0, s0
	LD	a1, 1 * SIZE(X)
	fmov	a4, t0
	SXADDQ	INCX, X, X

	ADD	s1, t1, s1
	LD	a2, 0 * SIZE(X)
	fmov	a5, t1
	unop

	ADD	s2, t2, s2
	LD	a3, 1 * SIZE(X)
	fmov	a6, t2
	SXADDQ	INCX, X, X

	ADD	s3, t3, s3
	LD	a4, 0 * SIZE(X)
	fmov	a7, t3
	unop

	LD	a5, 1 * SIZE(X)
	unop
	SXADDQ	INCX, X, X
	bne	I, $L12
	.align 4

/* Drain: load the fourth element of the final pass and retire what is
   staged; the values moved into t0/t1 from a4/a5 stay pending. */
$L13:
	ADD	s0, t0, s0
	LD	a6, 0 * SIZE(X)
	fmov	a0, t0

	ADD	s1, t1, s1
	LD	a7, 1 * SIZE(X)
	fmov	a1, t1
	SXADDQ	INCX, X, X

	ADD	s2, t2, s2
	fmov	a2, t2
	ADD	s3, t3, s3
	fmov	a3, t3

	ADD	s0, t0, s0
	fmov	a4, t0			/* pending until $L17 / $L999 */
	ADD	s1, t1, s1
	fmov	a5, t1			/* pending until $L17 / $L999 */
	ADD	s2, t2, s2
	fmov	a6, t2
	ADD	s3, t3, s3
	fmov	a7, t3

	ADD	s2, t2, s2
	ADD	s3, t3, s3

	.align 4

/* Tail: fold s2/s3 into s0/s1, then handle the N mod 4 leftovers. */
$L15:
	ADD	s0, s2, s0
	and	N, 3, I
	ADD	s1, s3, s1
	ble	I, $L999
	.align 4

/* One element per pass; each ADD consumes the value staged by the
   previous pass (or by the drain above). */
$L17:
	ADD	s0, t0, s0
	LD	a0, 0 * SIZE(X)
	fmov	a0, t0
	lda	I, -1(I)

	ADD	s1, t1, s1
	LD	a1, 1 * SIZE(X)
	fmov	a1, t1
	SXADDQ	INCX, X, X

	bne	I, $L17
	.align 4

/* Add the two pending values, combine both accumulators, return in s0. */
$L999:
	ADD	s0, t0, s0
	ADD	s1, t1, s1

	ADD	s0, s1, s0
	ret
	EPILOGUE
@@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||||
CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
ZASUMKERNEL = ../arm/zasum.c | ZASUMKERNEL = ../arm/zasum.c | ||||
SSUMKERNEL = ../arm/sum.c | |||||
DSUMKERNEL = ../arm/sum.c | |||||
CSUMKERNEL = ../arm/zsum.c | |||||
ZSUMKERNEL = ../arm/zsum.c | |||||
SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
DAXPYKERNEL = ../arm/axpy.c | DAXPYKERNEL = ../arm/axpy.c | ||||
CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
@@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S | |||||
CASUMKERNEL = asum_vfp.S | CASUMKERNEL = asum_vfp.S | ||||
ZASUMKERNEL = asum_vfp.S | ZASUMKERNEL = asum_vfp.S | ||||
SSUMKERNEL = sum_vfp.S | |||||
DSUMKERNEL = sum_vfp.S | |||||
SAXPYKERNEL = axpy_vfp.S | SAXPYKERNEL = axpy_vfp.S | ||||
DAXPYKERNEL = axpy_vfp.S | DAXPYKERNEL = axpy_vfp.S | ||||
CAXPYKERNEL = axpy_vfp.S | CAXPYKERNEL = axpy_vfp.S | ||||
@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
while(i < n) | while(i < n) | ||||
{ | { | ||||
if( x[ix] > minf ) | |||||
if( x[ix] < minf ) | |||||
{ | { | ||||
min = i; | min = i; | ||||
minf = x[ix]; | minf = x[ix]; | ||||
@@ -0,0 +1,51 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
/************************************************************************************** | |||||
* trivial copy of asum.c with the ABS() removed * | |||||
**************************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
/*
 * Plain sum of a strided vector.
 *
 *   n     - number of elements to add
 *   x     - input vector
 *   inc_x - distance, in elements, between consecutive inputs
 *
 * Returns x[0] + x[inc_x] + ... over n elements, accumulated in index
 * order.  Returns 0.0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0) return(total);

	/* One past the last index visited: n elements, inc_x apart. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
	{
		total += x[pos];
	}

	return(total);
}
@@ -0,0 +1,425 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
/************************************************************************************** | |||||
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * | |||||
**************************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
/* Not referenced by the macros visible in this part of the file. */
#define STACKSIZE 256

/* Core registers: element count, vector pointer, stride, loop counter.
   INC_X is added directly to X by the strided macros, so it presumably
   holds a byte stride by the time they run -- confirm in the function body. */
#define	N	r0
#define	X	r1
#define	INC_X	r2
#define I	r12

/* Offset from X used by the pld prefetch hints. */
#define X_PRE	512
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
#if !defined(COMPLEX) | |||||
#if defined(DOUBLE) | |||||
.macro KERNEL_F4 | |||||
pld [ X, #X_PRE ] | |||||
vldmia.f64 X!, { d4 - d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vldmia.f64 X!, { d6 - d7 } | |||||
vadd.f64 d1 , d1, d5 | |||||
vadd.f64 d0 , d0, d6 | |||||
vadd.f64 d1 , d1, d7 | |||||
.endm | |||||
.macro KERNEL_F1 | |||||
vldmia.f64 X!, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
.endm | |||||
.macro KERNEL_S4 | |||||
vldmia.f64 X, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
add X, X, INC_X | |||||
.endm | |||||
.macro KERNEL_S1 | |||||
vldmia.f64 X, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
add X, X, INC_X | |||||
.endm | |||||
#else | |||||
// Real single, contiguous: consumes 4 floats through the writeback loads and
// adds them alternately into s0 and s1. Unlike the other variants this macro
// issues no pld itself; the main loop issues one for this configuration.
.macro KERNEL_F4 | |||||
vldmia.f32 X!, { s4 - s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vldmia.f32 X!, { s6 - s7 } | |||||
vadd.f32 s1 , s1, s5 | |||||
vadd.f32 s0 , s0, s6 | |||||
vadd.f32 s1 , s1, s7 | |||||
.endm | |||||
// Real single, contiguous: consumes 1 float and adds it into s0.
.macro KERNEL_F1 | |||||
vldmia.f32 X!, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
.endm | |||||
// Real single, strided: four steps of "load one float at X, add it into s0,
// advance X by INC_X" (INC_X is used as a byte offset).
.macro KERNEL_S4 | |||||
vldmia.f32 X, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
add X, X, INC_X | |||||
.endm | |||||
// Real single, strided: a single step of KERNEL_S4.
.macro KERNEL_S1 | |||||
vldmia.f32 X, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
add X, X, INC_X | |||||
.endm | |||||
#endif | |||||
#else | |||||
#if defined(DOUBLE) | |||||
// Complex double, contiguous: consumes 8 doubles (two rounds of 4), adding
// them alternately into d0 and d1. Each round is preceded by a pld hint.
.macro KERNEL_F4 | |||||
pld [ X, #X_PRE ] | |||||
vldmia.f64 X!, { d4 - d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vldmia.f64 X!, { d6 - d7 } | |||||
vadd.f64 d1 , d1, d5 | |||||
vadd.f64 d0 , d0, d6 | |||||
vadd.f64 d1 , d1, d7 | |||||
pld [ X, #X_PRE ] | |||||
vldmia.f64 X!, { d4 - d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vldmia.f64 X!, { d6 - d7 } | |||||
vadd.f64 d1 , d1, d5 | |||||
vadd.f64 d0 , d0, d6 | |||||
vadd.f64 d1 , d1, d7 | |||||
.endm | |||||
// Complex double, contiguous: consumes 2 doubles (one two-value element) and
// adds both into d0.
.macro KERNEL_F1 | |||||
vldmia.f64 X!, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vldmia.f64 X!, { d4 } | |||||
vadd.f64 d0 , d0, d4 | |||||
.endm | |||||
// Complex double, strided: four steps of "load the pair d4/d5 at X, add both
// into d0, advance X by INC_X" (INC_X is used as a byte offset).
.macro KERNEL_S4 | |||||
vldmia.f64 X, { d4 -d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vadd.f64 d0 , d0, d5 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 -d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vadd.f64 d0 , d0, d5 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 -d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vadd.f64 d0 , d0, d5 | |||||
add X, X, INC_X | |||||
vldmia.f64 X, { d4 -d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vadd.f64 d0 , d0, d5 | |||||
add X, X, INC_X | |||||
.endm | |||||
// Complex double, strided: a single step of KERNEL_S4.
.macro KERNEL_S1 | |||||
vldmia.f64 X, { d4 -d5 } | |||||
vadd.f64 d0 , d0, d4 | |||||
vadd.f64 d0 , d0, d5 | |||||
add X, X, INC_X | |||||
.endm | |||||
#else | |||||
// Complex single, contiguous: consumes 8 floats (two rounds of 4), adding
// them alternately into s0 and s1. Only one pld hint is issued, at the top.
.macro KERNEL_F4 | |||||
pld [ X, #X_PRE ] | |||||
vldmia.f32 X!, { s4 - s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vldmia.f32 X!, { s6 - s7 } | |||||
vadd.f32 s1 , s1, s5 | |||||
vadd.f32 s0 , s0, s6 | |||||
vadd.f32 s1 , s1, s7 | |||||
vldmia.f32 X!, { s4 - s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vldmia.f32 X!, { s6 - s7 } | |||||
vadd.f32 s1 , s1, s5 | |||||
vadd.f32 s0 , s0, s6 | |||||
vadd.f32 s1 , s1, s7 | |||||
.endm | |||||
// Complex single, contiguous: consumes 2 floats (one two-value element) and
// adds both into s0.
.macro KERNEL_F1 | |||||
vldmia.f32 X!, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vldmia.f32 X!, { s4 } | |||||
vadd.f32 s0 , s0, s4 | |||||
.endm | |||||
// Complex single, strided: four steps of "load the pair s4/s5 at X, add both
// into s0, advance X by INC_X" (INC_X is used as a byte offset).
.macro KERNEL_S4 | |||||
vldmia.f32 X, { s4 -s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vadd.f32 s0 , s0, s5 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 -s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vadd.f32 s0 , s0, s5 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 -s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vadd.f32 s0 , s0, s5 | |||||
add X, X, INC_X | |||||
vldmia.f32 X, { s4 -s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vadd.f32 s0 , s0, s5 | |||||
add X, X, INC_X | |||||
.endm | |||||
// Complex single, strided: a single step of KERNEL_S4.
.macro KERNEL_S1 | |||||
vldmia.f32 X, { s4 -s5 } | |||||
vadd.f32 s0 , s0, s4 | |||||
vadd.f32 s0 , s0, s5 | |||||
add X, X, INC_X | |||||
.endm | |||||
#endif | |||||
#endif | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
/*
 * Kernel entry point (the symbol itself is emitted by the PROLOGUE macro).
 *
 * Inputs:  N (r0) element count, X (r1) vector address, INC_X (r2) stride in
 *          elements.
 * Result:  the plain sum of the vector values (no absolute value taken),
 *          accumulated in s0/s1 (single) or d0/d1 (double) and combined at
 *          asum_kernel_L999. When __ARM_PCS_VFP is not defined the result is
 *          additionally copied to r0 (single) or r0:r1 (double).
 *
 * Returns 0 when N <= 0 or INC_X <= 0. The stride test used to be "beq",
 * which let a negative stride fall into the strided loop and read memory in
 * front of X; "ble" matches the C kernel (n <= 0 || inc_x <= 0).
 */
	PROLOGUE

	.align 5

	movs	r12, #0				// clear floating point register
	vmov	s0, r12
	vmov	s1, r12
#if defined(DOUBLE)
	vcvt.f64.f32	d0, s0			// widen the zeroed accumulators
	vcvt.f64.f32	d1, s1
#endif

	cmp	N, #0
	ble	asum_kernel_L999		// nothing to sum

	cmp	INC_X, #0
	ble	asum_kernel_L999		// zero or negative stride: return 0

	cmp	INC_X, #1
	bne	asum_kernel_S_BEGIN		// non-unit stride: strided path

asum_kernel_F_BEGIN:

	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_F1

	.align 5

asum_kernel_F4:

#if !defined(DOUBLE) && !defined(COMPLEX)
	pld	[ X, #X_PRE ]			// this variant's KERNEL_F4 has no pld of its own
#endif
	KERNEL_F4

	subs	I, I, #1
	ble	asum_kernel_F1

	KERNEL_F4				// loop body is unrolled twice

	subs	I, I, #1
	bne	asum_kernel_F4

asum_kernel_F1:

	ands	I, N, #3			// remaining N % 4 elements
	ble	asum_kernel_L999

asum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	asum_kernel_F10

	b	asum_kernel_L999

asum_kernel_S_BEGIN:

	// convert the element stride into a byte stride
#if defined(COMPLEX)

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4		// INC_X * SIZE * 2
#else
	lsl	INC_X, INC_X, #3		// INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		// INC_X * SIZE
#else
	lsl	INC_X, INC_X, #2		// INC_X * SIZE
#endif

#endif

	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_S1

	.align 5

asum_kernel_S4:

	KERNEL_S4

	subs	I, I, #1
	bne	asum_kernel_S4

asum_kernel_S1:

	ands	I, N, #3			// remaining N % 4 elements
	ble	asum_kernel_L999

asum_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	asum_kernel_S10

asum_kernel_L999:

	// combine the two partial accumulators
#if defined(DOUBLE)
	vadd.f64	d0 , d0, d1		// set return value
#else
	vadd.f32	s0 , s0, s1		// set return value
#endif

#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
	vmov	r0, s0
#else
	vmov	r0, r1, d0
#endif
#endif

	bx	lr

	EPILOGUE
@@ -0,0 +1,57 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
/************************************************************************************** | |||||
* trivial copy of zasum.c with the ABS() removed * | |||||
**************************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
/*
 * Sum of the two consecutive values x[i] and x[i+1] (one interleaved complex
 * element). Arguments and result are parenthesized so the macro expands
 * safely inside any surrounding expression.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Plain (non-absolute) sum over a strided complex vector.
 *
 * n      number of complex elements to visit
 * x      vector data, stored as interleaved value pairs
 * inc_x  stride between visited elements, counted in complex elements
 *
 * Returns the sum of both values of every visited element, or 0 when
 * n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	FLOAT sumf = 0.0;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) return(sumf);

	/* each complex element occupies two FLOAT slots */
	inc_x2 = 2 * inc_x;

	/* n becomes the exclusive upper bound on the FLOAT index */
	n *= inc_x2;
	while (i < n)
	{
		sumf += CSUM1(x, i);
		i += inc_x2;
	}
	return(sumf);
}
@@ -0,0 +1,175 @@ | |||||
# Kernels taken from C sources: ../arm/ for the min/amin/max and index-of-min/max
# routines, ../generic/ for the TRMM and TRSM kernels.
SAMINKERNEL = ../arm/amin.c | |||||
DAMINKERNEL = ../arm/amin.c | |||||
CAMINKERNEL = ../arm/zamin.c | |||||
ZAMINKERNEL = ../arm/zamin.c | |||||
SMAXKERNEL = ../arm/max.c | |||||
DMAXKERNEL = ../arm/max.c | |||||
SMINKERNEL = ../arm/min.c | |||||
DMINKERNEL = ../arm/min.c | |||||
ISAMINKERNEL = ../arm/iamin.c | |||||
IDAMINKERNEL = ../arm/iamin.c | |||||
ICAMINKERNEL = ../arm/izamin.c | |||||
IZAMINKERNEL = ../arm/izamin.c | |||||
ISMAXKERNEL = ../arm/imax.c | |||||
IDMAXKERNEL = ../arm/imax.c | |||||
ISMINKERNEL = ../arm/imin.c | |||||
IDMINKERNEL = ../arm/imin.c | |||||
# NOTE: the four *TRMMKERNEL variables below are assigned again further down in
# this file, next to the matching *GEMMKERNEL lines. Assuming the file is read
# top to bottom by make, those later "=" assignments are the ones in effect.
STRMMKERNEL = ../generic/trmmkernel_4x4.c | |||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
# TRSM kernels: all four precisions share the same generic LN/LT/RN/RT sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
# Vector and matrix-vector kernels provided as assembly sources (.S). The names
# carry no directory prefix; presumably they are resolved relative to the
# directory this file lives in - confirm against the including Makefile.
SAMAXKERNEL = amax.S | |||||
DAMAXKERNEL = amax.S | |||||
CAMAXKERNEL = zamax.S | |||||
ZAMAXKERNEL = zamax.S | |||||
ISAMAXKERNEL = iamax.S | |||||
IDAMAXKERNEL = iamax.S | |||||
ICAMAXKERNEL = izamax.S | |||||
IZAMAXKERNEL = izamax.S | |||||
# ASUM: the real precisions share asum.S; each complex precision has its own file.
SASUMKERNEL = asum.S | |||||
DASUMKERNEL = asum.S | |||||
CASUMKERNEL = casum.S | |||||
ZASUMKERNEL = zasum.S | |||||
SAXPYKERNEL = axpy.S | |||||
DAXPYKERNEL = axpy.S | |||||
CAXPYKERNEL = zaxpy.S | |||||
ZAXPYKERNEL = zaxpy.S | |||||
# COPY (and SWAP below): one source serves all four precisions.
SCOPYKERNEL = copy.S | |||||
DCOPYKERNEL = copy.S | |||||
CCOPYKERNEL = copy.S | |||||
ZCOPYKERNEL = copy.S | |||||
SDOTKERNEL = dot.S | |||||
DDOTKERNEL = dot.S | |||||
CDOTKERNEL = zdot.S | |||||
ZDOTKERNEL = zdot.S | |||||
DSDOTKERNEL = dot.S | |||||
SNRM2KERNEL = nrm2.S | |||||
DNRM2KERNEL = nrm2.S | |||||
CNRM2KERNEL = znrm2.S | |||||
ZNRM2KERNEL = znrm2.S | |||||
SROTKERNEL = rot.S | |||||
DROTKERNEL = rot.S | |||||
CROTKERNEL = zrot.S | |||||
ZROTKERNEL = zrot.S | |||||
SSCALKERNEL = scal.S | |||||
DSCALKERNEL = scal.S | |||||
CSCALKERNEL = zscal.S | |||||
ZSCALKERNEL = zscal.S | |||||
SSWAPKERNEL = swap.S | |||||
DSWAPKERNEL = swap.S | |||||
CSWAPKERNEL = swap.S | |||||
ZSWAPKERNEL = swap.S | |||||
# GEMV: separate sources for the non-transposed (N) and transposed (T) cases.
SGEMVNKERNEL = gemv_n.S | |||||
DGEMVNKERNEL = gemv_n.S | |||||
CGEMVNKERNEL = zgemv_n.S | |||||
ZGEMVNKERNEL = zgemv_n.S | |||||
SGEMVTKERNEL = gemv_t.S | |||||
DGEMVTKERNEL = gemv_t.S | |||||
CGEMVTKERNEL = zgemv_t.S | |||||
ZGEMVTKERNEL = zgemv_t.S | |||||
# SGEMM / STRMM: the kernel file name is built from the configured unroll
# factors SGEMM_UNROLL_M x SGEMM_UNROLL_N. This STRMMKERNEL assignment replaces
# the generic one made earlier in the file.
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
# The INCOPY/ITCOPY routines (sized by UNROLL_M) are only set when the two
# unroll factors differ.
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
# The ONCOPY/OTCOPY routines (sized by UNROLL_N) are always set.
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# DGEMM / DTRMM: the kernel file name is built from DGEMM_UNROLL_M x
# DGEMM_UNROLL_N. This DTRMMKERNEL assignment replaces the generic one made
# earlier in the file.
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
# INCOPY/ITCOPY are only set when the unroll factors differ. An assembly copy
# routine is selected for UNROLL_M == 8, the generic C one otherwise.
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
ifeq ($(DGEMM_UNROLL_M), 8) | |||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
else | |||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
endif | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
# ONCOPY/OTCOPY: an assembly copy routine is selected for UNROLL_N == 4, the
# generic C one otherwise.
ifeq ($(DGEMM_UNROLL_N), 4) | |||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
else | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
endif | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# CGEMM / CTRMM: the kernel file name is built from CGEMM_UNROLL_M x
# CGEMM_UNROLL_N. This CTRMMKERNEL assignment replaces the generic one made
# earlier in the file. The copy routines come from the generic zgemm_* sources.
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
# INCOPY/ITCOPY are only set when the two unroll factors differ.
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# ZGEMM / ZTRMM: the kernel file name is built from ZGEMM_UNROLL_M x
# ZGEMM_UNROLL_N. This ZTRMMKERNEL assignment replaces the generic one made
# earlier in the file.
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
# INCOPY/ITCOPY are only set when the two unroll factors differ.
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
endif | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
@@ -0,0 +1,164 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define N x0 /* vector length */ | |||||
#define X x1 /* X vector address */ | |||||
#define INC_X x2 /* X stride */ | |||||
#define I x5 /* loop variable */ | |||||
/******************************************************************************* | |||||
* Macro definitions | |||||
*******************************************************************************/ | |||||
// REG0: zero register used to clear the accumulator.
// SUMF: scalar accumulator and return value (s0, low lane of v0).
// TMPF: scalar scratch register (s1, low lane of v1).
// TMPVF and SZ are not referenced by the visible code of this file.
#define REG0 wzr | |||||
#define SUMF s0 | |||||
#define TMPF s1 | |||||
#define TMPVF {v1.s}[0] | |||||
#define SZ 4 | |||||
/******************************************************************************/ | |||||
// Contiguous path, one element: load two floats into v1 (X advances by 8),
// copy the second one into lane 0 of v2 via ext, form s1 = s1 + s2 and add
// that pair sum into SUMF.
.macro KERNEL_F1 | |||||
ld1 {v1.2s}, [X], #8 | |||||
ext v2.8b, v1.8b, v1.8b, #4 | |||||
fadd TMPF, TMPF, s2 | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
// Contiguous path, 16 floats per call: load v1..v4, advance X by 64 bytes,
// prefetch ahead, then reduce the four vectors lane-wise into the vector
// accumulator v0.
.macro KERNEL_F8 | |||||
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] | |||||
add X, X, #64 | |||||
PRFM PLDL1KEEP, [X, #1024] | |||||
fadd v1.4s, v1.4s, v2.4s | |||||
fadd v3.4s, v3.4s, v4.4s | |||||
fadd v0.4s, v0.4s, v1.4s | |||||
fadd v0.4s, v0.4s, v3.4s | |||||
.endm | |||||
// Collapse the four lanes of v0 into the scalar SUMF: fold the upper half onto
// the lower half, then add the remaining pair.
.macro KERNEL_F8_FINALIZE | |||||
ext v1.16b, v0.16b, v0.16b, #8 | |||||
fadd v0.2s, v0.2s, v1.2s | |||||
faddp SUMF, v0.2s | |||||
.endm | |||||
// Strided path setup: scale INC_X by 8 so it can be used as a byte offset.
.macro INIT_S | |||||
lsl INC_X, INC_X, #3 | |||||
.endm | |||||
// Strided path, one element: same as KERNEL_F1 but X advances by INC_X.
.macro KERNEL_S1 | |||||
ld1 {v1.2s}, [X], INC_X | |||||
ext v2.8b, v1.8b, v1.8b, #4 | |||||
fadd TMPF, TMPF, s2 | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
/******************************************************************************* | |||||
* End of macro definitions | |||||
*******************************************************************************/ | |||||
// Kernel entry point. Clears the accumulator, returns it unchanged (zero) when
// N <= 0 or INC_X <= 0, and otherwise dispatches to the contiguous path
// (INC_X == 1) or the strided path. The result is left in SUMF.
PROLOGUE | |||||
fmov SUMF, REG0 | |||||
fmov s1, SUMF | |||||
cmp N, xzr | |||||
ble .Lcsum_kernel_L999 | |||||
cmp INC_X, xzr | |||||
ble .Lcsum_kernel_L999 | |||||
cmp INC_X, #1 | |||||
bne .Lcsum_kernel_S_BEGIN | |||||
.Lcsum_kernel_F_BEGIN: | |||||
// I = N / 8 iterations of the vector loop
asr I, N, #3 | |||||
cmp I, xzr | |||||
beq .Lcsum_kernel_F1 | |||||
.Lcsum_kernel_F8: | |||||
KERNEL_F8 | |||||
subs I, I, #1 | |||||
bne .Lcsum_kernel_F8 | |||||
// reduce the vector accumulator to a scalar before the scalar tail runs
KERNEL_F8_FINALIZE | |||||
.Lcsum_kernel_F1: | |||||
// remaining N % 8 elements, one at a time
ands I, N, #7 | |||||
ble .Lcsum_kernel_L999 | |||||
.Lcsum_kernel_F10: | |||||
KERNEL_F1 | |||||
subs I, I, #1 | |||||
bne .Lcsum_kernel_F10 | |||||
.Lcsum_kernel_L999: | |||||
ret | |||||
.Lcsum_kernel_S_BEGIN: | |||||
INIT_S | |||||
// I = N / 4 iterations of the 4x unrolled strided loop
asr I, N, #2 | |||||
cmp I, xzr | |||||
ble .Lcsum_kernel_S1 | |||||
.Lcsum_kernel_S4: | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lcsum_kernel_S4 | |||||
.Lcsum_kernel_S1: | |||||
// remaining N % 4 elements
ands I, N, #3 | |||||
ble .Lcsum_kernel_L999 | |||||
.Lcsum_kernel_S10: | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lcsum_kernel_S10 | |||||
ret | |||||
EPILOGUE |
@@ -0,0 +1,186 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define N x0 /* vector length */ | |||||
#define X x1 /* X vector address */ | |||||
#define INC_X x2 /* X stride */ | |||||
#define I x5 /* loop variable */ | |||||
/******************************************************************************* | |||||
* Macro definitions | |||||
*******************************************************************************/ | |||||
// Precision-dependent aliases:
// REG0  zero register used to clear the accumulator
// SUMF  scalar accumulator and return value (low lane of v0)
// TMPF  scalar scratch register (low lane of v1)
// TMPVF lane 0 of v1 in vector-lane syntax, the ld1 target in KERNEL_S1
// SZ    element size in bytes, the post-increment in KERNEL_F1
#if !defined(DOUBLE) | |||||
#define REG0 wzr | |||||
#define SUMF s0 | |||||
#define TMPF s1 | |||||
#define TMPVF {v1.s}[0] | |||||
#define SZ 4 | |||||
#else | |||||
#define REG0 xzr | |||||
#define SUMF d0 | |||||
#define TMPF d1 | |||||
#define TMPVF {v1.d}[0] | |||||
#define SZ 8 | |||||
#endif | |||||
/******************************************************************************/ | |||||
// Contiguous path, one element: load it (X advances by SZ) and add it to SUMF.
.macro KERNEL_F1 | |||||
ldr TMPF, [X], #SZ | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
// Contiguous path, 8 elements per call, added lane-wise into the vector
// accumulator v0.
.macro KERNEL_F8 | |||||
#if !defined(DOUBLE) | |||||
ld1 {v1.4s, v2.4s}, [X], #32 // load 8 floats: v1 = X0..X3, v2 = X4..X7 | |||||
fadd v1.4s, v1.4s, v2.4s // v1 = v1 + v2, lane by lane | |||||
fadd v0.4s, v0.4s, v1.4s // accumulate the four partial sums into v0 | |||||
PRFM PLDL1KEEP, [X, #1024] | |||||
#else // DOUBLE | |||||
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] | |||||
add X, X, #64 | |||||
PRFM PLDL1KEEP, [X, #1024] | |||||
fadd v2.2d, v2.2d, v3.2d | |||||
fadd v4.2d, v4.2d, v5.2d | |||||
fadd v0.2d, v0.2d, v2.2d | |||||
fadd v0.2d, v0.2d, v4.2d | |||||
#endif | |||||
.endm | |||||
// Collapse the lanes of v0 into the scalar SUMF. Single precision folds the
// upper half onto the lower half first; double precision needs only one
// pairwise add.
.macro KERNEL_F8_FINALIZE | |||||
#if !defined(DOUBLE) | |||||
ext v1.16b, v0.16b, v0.16b, #8 | |||||
fadd v0.2s, v0.2s, v1.2s | |||||
faddp SUMF, v0.2s | |||||
#else | |||||
faddp SUMF, v0.2d | |||||
#endif | |||||
.endm | |||||
// Strided path setup: scale INC_X by the element size (4 or 8) so it can be
// used as a byte offset.
.macro INIT_S | |||||
#if !defined(DOUBLE) | |||||
lsl INC_X, INC_X, #2 | |||||
#else | |||||
lsl INC_X, INC_X, #3 | |||||
#endif | |||||
.endm | |||||
// Strided path, one element: load it into lane 0 of v1 (X advances by INC_X)
// and add that lane, read through its scalar alias TMPF, to SUMF.
.macro KERNEL_S1 | |||||
ld1 TMPVF, [X], INC_X | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
/******************************************************************************* | |||||
* End of macro definitions | |||||
*******************************************************************************/ | |||||
// Kernel entry point. Clears the accumulator, returns it unchanged (zero) when
// N <= 0 or INC_X <= 0, and otherwise dispatches to the contiguous path
// (INC_X == 1) or the strided path. The result is left in SUMF.
PROLOGUE | |||||
fmov SUMF, REG0 | |||||
#if !defined(DOUBLE) | |||||
fmov s1, SUMF | |||||
#else | |||||
fmov d1, SUMF | |||||
#endif | |||||
cmp N, xzr | |||||
ble .Lsum_kernel_L999 | |||||
cmp INC_X, xzr | |||||
ble .Lsum_kernel_L999 | |||||
cmp INC_X, #1 | |||||
bne .Lsum_kernel_S_BEGIN | |||||
.Lsum_kernel_F_BEGIN: | |||||
// I = N / 8 iterations of the vector loop
asr I, N, #3 | |||||
cmp I, xzr | |||||
beq .Lsum_kernel_F1 | |||||
.Lsum_kernel_F8: | |||||
KERNEL_F8 | |||||
subs I, I, #1 | |||||
bne .Lsum_kernel_F8 | |||||
// reduce the vector accumulator to a scalar before the scalar tail runs
KERNEL_F8_FINALIZE | |||||
.Lsum_kernel_F1: | |||||
// remaining N % 8 elements, one at a time
ands I, N, #7 | |||||
ble .Lsum_kernel_L999 | |||||
.Lsum_kernel_F10: | |||||
KERNEL_F1 | |||||
subs I, I, #1 | |||||
bne .Lsum_kernel_F10 | |||||
.Lsum_kernel_L999: | |||||
ret | |||||
.Lsum_kernel_S_BEGIN: | |||||
INIT_S | |||||
// I = N / 4 iterations of the 4x unrolled strided loop
asr I, N, #2 | |||||
cmp I, xzr | |||||
ble .Lsum_kernel_S1 | |||||
.Lsum_kernel_S4: | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lsum_kernel_S4 | |||||
.Lsum_kernel_S1: | |||||
// remaining N % 4 elements
ands I, N, #3 | |||||
ble .Lsum_kernel_L999 | |||||
.Lsum_kernel_S10: | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lsum_kernel_S10 | |||||
ret | |||||
EPILOGUE |
@@ -0,0 +1,158 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2015, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define N x0 /* vector length */ | |||||
#define X x1 /* X vector address */ | |||||
#define INC_X x2 /* X stride */ | |||||
#define I x5 /* loop variable */ | |||||
/******************************************************************************* | |||||
* Macro definitions | |||||
*******************************************************************************/ | |||||
// REG0: zero register used to clear the accumulator.
// SUMF: scalar accumulator and return value (d0, low lane of v0).
// TMPF: scalar scratch register (d1).
// TMPVF and SZ are not referenced by the visible code of this file.
#define REG0 xzr | |||||
#define SUMF d0 | |||||
#define TMPF d1 | |||||
#define TMPVF {v1.d}[0] | |||||
#define SZ 8 | |||||
/******************************************************************************/ | |||||
// Contiguous path, one element: load two doubles into v1 (X advances by 16),
// add the pair into TMPF and accumulate into SUMF.
.macro KERNEL_F1 | |||||
ld1 {v1.2d}, [X], #16 | |||||
faddp TMPF, v1.2d | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
// Contiguous path, 8 doubles per call: load v1..v4 (X advances by 64), reduce
// them lane-wise into the vector accumulator v0, then prefetch ahead.
.macro KERNEL_F4 | |||||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 | |||||
fadd v1.2d, v1.2d, v2.2d | |||||
fadd v3.2d, v3.2d, v4.2d | |||||
fadd v0.2d, v0.2d, v1.2d | |||||
fadd v0.2d, v0.2d, v3.2d | |||||
PRFM PLDL1KEEP, [X, #1024] | |||||
.endm | |||||
// Collapse the two lanes of v0 into the scalar SUMF.
.macro KERNEL_F4_FINALIZE | |||||
faddp SUMF, v0.2d | |||||
.endm | |||||
// Strided path setup: scale INC_X by 16 so it can be used as a byte offset.
.macro INIT_S | |||||
lsl INC_X, INC_X, #4 | |||||
.endm | |||||
// Strided path, one element: same as KERNEL_F1 but X advances by INC_X.
.macro KERNEL_S1 | |||||
ld1 {v1.2d}, [X], INC_X | |||||
faddp TMPF, v1.2d | |||||
fadd SUMF, SUMF, TMPF | |||||
.endm | |||||
/******************************************************************************* | |||||
* End of macro definitions | |||||
*******************************************************************************/ | |||||
// Kernel entry point. Clears the accumulator, returns it unchanged (zero) when
// N <= 0 or INC_X <= 0, and otherwise dispatches to the contiguous path
// (INC_X == 1) or the strided path. The result is left in SUMF.
PROLOGUE | |||||
fmov SUMF, REG0 | |||||
cmp N, xzr | |||||
ble .Lzsum_kernel_L999 | |||||
cmp INC_X, xzr | |||||
ble .Lzsum_kernel_L999 | |||||
cmp INC_X, #1 | |||||
bne .Lzsum_kernel_S_BEGIN | |||||
.Lzsum_kernel_F_BEGIN: | |||||
// I = N / 4 iterations of the vector loop
asr I, N, #2 | |||||
cmp I, xzr | |||||
beq .Lzsum_kernel_F1 | |||||
.Lzsum_kernel_F4: | |||||
KERNEL_F4 | |||||
subs I, I, #1 | |||||
bne .Lzsum_kernel_F4 | |||||
// reduce the vector accumulator to a scalar before the scalar tail runs
KERNEL_F4_FINALIZE | |||||
.Lzsum_kernel_F1: | |||||
// remaining N % 4 elements, one at a time
ands I, N, #3 | |||||
ble .Lzsum_kernel_L999 | |||||
.Lzsum_kernel_F10: | |||||
KERNEL_F1 | |||||
subs I, I, #1 | |||||
bne .Lzsum_kernel_F10 | |||||
.Lzsum_kernel_L999: | |||||
ret | |||||
.Lzsum_kernel_S_BEGIN: | |||||
INIT_S | |||||
// I = N / 4 iterations of the 4x unrolled strided loop
asr I, N, #2 | |||||
cmp I, xzr | |||||
ble .Lzsum_kernel_S1 | |||||
.Lzsum_kernel_S4: | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lzsum_kernel_S4 | |||||
.Lzsum_kernel_S1: | |||||
// remaining N % 4 elements
ands I, N, #3 | |||||
ble .Lzsum_kernel_L999 | |||||
.Lzsum_kernel_S10: | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne .Lzsum_kernel_S10 | |||||
ret | |||||
EPILOGUE |
@@ -60,6 +60,10 @@ CASUMKERNEL = asum.S | |||||
ZASUMKERNEL = asum.S | ZASUMKERNEL = asum.S | ||||
XASUMKERNEL = asum.S | XASUMKERNEL = asum.S | ||||
# Sum kernels for the C, Z and X variants all use the same source, sum.S.
CSUMKERNEL = sum.S | |||||
ZSUMKERNEL = sum.S | |||||
XSUMKERNEL = sum.S | |||||
CNRM2KERNEL = nrm2.S | CNRM2KERNEL = nrm2.S | ||||
ZNRM2KERNEL = nrm2.S | ZNRM2KERNEL = nrm2.S | ||||
XNRM2KERNEL = nrm2.S | XNRM2KERNEL = nrm2.S | ||||
@@ -0,0 +1,358 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* Copyright 2019, The OpenBLAS project */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
/* Prefetch distance in elements; it is multiplied by SIZE when PRE1 is set up. */
#ifdef XDOUBLE | |||||
#define PREFETCH_SIZE ( 8 * 16 + 4) | |||||
#elif defined(DOUBLE) | |||||
#define PREFETCH_SIZE (16 * 16 + 8) | |||||
#else | |||||
#define PREFETCH_SIZE (32 * 16 + 16) | |||||
#endif | |||||
/* COMPADD adjusts the shift counts and masks used below (0 for real data, */
/* 1 for complex). STRIDE is the post-increment applied by the element loads. */
#ifndef COMPLEX | |||||
#define COMPADD 0 | |||||
#define STRIDE INCX | |||||
#else | |||||
#define COMPADD 1 | |||||
#define STRIDE SIZE | |||||
#endif | |||||
/* Register aliases: PRE1 is the prefetch pointer, I and J are derived from N */
/* by a shift and a mask, INCX16 is a scaled copy of INCX, PR and ARLC hold */
/* saved copies of pr and ar.lc, and N, X, INCX are the routine's inputs. */
#define PRE1 r2 | |||||
#define I r17 | |||||
#define J r18 | |||||
#define INCX16 r21 | |||||
#define PR r30 | |||||
#define ARLC r31 | |||||
#define N r32 | |||||
#define X r33 | |||||
#define INCX r34 | |||||
/* Entry and setup.  Clears the four partial sums f8-f11, returns early    */
/* (with f8 still zero) when INCX or N is not positive, computes the pass  */
/* count I and leftover count J, and arms the rotating predicates and      */
/* ar.lc / ar.ec for the pipelined loop at .L52.                           */
PROLOGUE | |||||
.prologue | |||||
PROFCODE | |||||
{ .mfi | |||||
/* PRE1 starts PREFETCH_SIZE * SIZE bytes past X */
adds PRE1 = PREFETCH_SIZE * SIZE, X | |||||
/* f0 always reads as 0.0 on IA-64, so this zeroes the first accumulator */
mov f8 = f0 | |||||
/* keep the caller's ar.lc; it is written back at .L998 */
.save ar.lc, ARLC | |||||
mov ARLC = ar.lc | |||||
} | |||||
;; | |||||
.body | |||||
#ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: load the values they point to */
{ .mmi | |||||
LDINT N = [N] | |||||
LDINT INCX = [INCX] | |||||
nop.i 0 | |||||
} | |||||
;; | |||||
#ifndef USE64BITINT | |||||
/* sign-extend the low 32 bits of both values */
{ .mii | |||||
nop.m 0 | |||||
sxt4 N = N | |||||
sxt4 INCX = INCX | |||||
} | |||||
;; | |||||
#endif | |||||
#endif | |||||
{ .mmi | |||||
/* p6 = !(0 < INCX) and p7 = !(0 < N); either one takes the early return */
cmp.lt p0, p6 = r0, INCX | |||||
cmp.lt p0, p7 = r0, N | |||||
/* I = N >> (4 - COMPADD): number of full passes of the unrolled loop */
shr I = N, (4 - COMPADD) | |||||
} | |||||
{ .mbb | |||||
/* J = low (4 - COMPADD) bits of N: what is left for the tail at .L55 */
and J = ((1 << (4 - COMPADD)) - 1), N | |||||
(p6) br.ret.sptk.many b0 | |||||
(p7) br.ret.sptk.many b0 | |||||
} | |||||
;; | |||||
{ .mfi | |||||
/* I - 1 is the value written to ar.lc below; negative means no full pass */
adds I = -1, I | |||||
mov f10 = f0 | |||||
/* save the predicate registers; written back under a mask at .L998 */
mov PR = pr | |||||
} | |||||
{ .mfi | |||||
/* p9 = (J == 0): nothing left over for the tail */
cmp.eq p9, p0 = r0, J | |||||
mov f9 = f0 | |||||
/* p12 = bit (3 - COMPADD) of N is set: the tail has a group of 8 loads */
tbit.z p0, p12 = N, 3 - COMPADD | |||||
} | |||||
;; | |||||
{ .mmi | |||||
/* start the pipeline with only stage p16 on; p17, p18, p19 are cleared */
cmp.eq p16, p0 = r0, r0 | |||||
cmp.ne p17, p0 = r0, r0 | |||||
mov ar.ec= 3 | |||||
} | |||||
{ .mfi | |||||
cmp.ne p18, p0 = r0, r0 | |||||
mov f11 = f0 | |||||
/* scale INCX by 2^(BASE_SHIFT + COMPADD); presumably element count ->  */
/* byte stride (BASE_SHIFT is defined outside this file - confirm)      */
shl INCX = INCX, BASE_SHIFT + COMPADD | |||||
} | |||||
;; | |||||
{ .mmi | |||||
/* INCX16 = scaled INCX shifted left again: the post-increment of each lfetch */
#ifdef XDOUBLE | |||||
shladd INCX16 = INCX, (3 - COMPADD), r0 | |||||
#else | |||||
shladd INCX16 = INCX, (4 - COMPADD), r0 | |||||
#endif | |||||
cmp.ne p19, p0 = r0, r0 | |||||
mov ar.lc = I | |||||
} | |||||
{ .mmb | |||||
/* p8 = (I < 0): not even one full pass, go straight to the tail */
cmp.gt p8 ,p0 = r0, I | |||||
#ifdef COMPLEX | |||||
/* COMPLEX loads alternate post-increments of SIZE and INCX, so INCX is */
/* reduced by SIZE to make each pair advance by the scaled stride       */
adds INCX = - SIZE, INCX | |||||
#else | |||||
nop.m 0 | |||||
#endif | |||||
(p8) br.cond.dpnt .L55 | |||||
} | |||||
;; | |||||
/* Main loop, software-pipelined through the rotating predicates p16-p19   */
/* and closed by br.ctop.  Stage p16 issues 16 loads per pass, alternating */
/* STRIDE and INCX as the post-increment of X.  Stage p18 adds the first   */
/* 12 of those values (seen two register rotations later, e.g. f32 -> f34) */
/* and stage p19 adds the last 4 (three rotations later, e.g. f68 -> f71)  */
/* into the accumulators f8-f11 in round-robin order.                      */
.align 32 | |||||
.L52: | |||||
{ .mmf | |||||
/* prefetch ahead of X, then advance the prefetch pointer by INCX16 */
(p16) lfetch.nt1 [PRE1], INCX16 | |||||
(p16) LDFD f32 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p19) FADD f8 = f8, f71 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f35 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p19) FADD f9 = f9, f74 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f38 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p19) FADD f10 = f10, f77 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f41 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p19) FADD f11 = f11, f80 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f44 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f8 = f8, f34 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f47 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f9 = f9, f37 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f50 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f10 = f10, f40 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f53 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f11 = f11, f43 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
/* the XDOUBLE build issues a second prefetch halfway through the pass */
#ifdef XDOUBLE | |||||
(p16) lfetch.nt1 [PRE1], INCX16 | |||||
#endif | |||||
(p16) LDFD f56 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f8 = f8, f46 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f59 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f9 = f9, f49 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f62 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f10 = f10, f52 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f65 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f11 = f11, f55 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f68 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f8 = f8, f58 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f71 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f9 = f9, f61 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f74 = [X], STRIDE | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f10 = f10, f64 | |||||
} | |||||
;; | |||||
{ .mmf | |||||
(p16) LDFD f77 = [X], INCX | |||||
} | |||||
{ .mfb | |||||
(p18) FADD f11 = f11, f67 | |||||
br.ctop.sptk.few .L52 | |||||
} | |||||
;; | |||||
/* Drain: the same four additions as stage p19, issued once more and    */
/* unpredicated after the loop falls through.                           */
FADD f8 = f8, f71 | |||||
FADD f9 = f9, f74 | |||||
FADD f10 = f10, f77 | |||||
FADD f11 = f11, f80 | |||||
.align 32 | |||||
;; | |||||
/* Tail: adds the leftover values that did not fill a whole loop pass.     */
/* Each predicate comes from one bit of N and gates a group of loads and   */
/* the matching additions: p12 -> 8 loads (f32-f39), p13 -> 4 (f40-f43),   */
/* p14 -> 2 (f44-f45), p15 -> 1 (f46, real build only).  Branches to .L998 */
/* immediately when p9 (J == 0) is set.                                    */
.L55: | |||||
(p12) LDFD f32 = [X], STRIDE | |||||
(p9) br.cond.dptk .L998 | |||||
;; | |||||
(p12) LDFD f33 = [X], INCX | |||||
;; | |||||
(p12) LDFD f34 = [X], STRIDE | |||||
;; | |||||
(p12) LDFD f35 = [X], INCX | |||||
/* p13 = bit (2 - COMPADD) of N is set */
tbit.z p0, p13 = N, (2 - COMPADD) | |||||
;; | |||||
(p12) LDFD f36 = [X], STRIDE | |||||
/* p14 = bit (1 - COMPADD) of N is set */
tbit.z p0, p14 = N, (1 - COMPADD) | |||||
;; | |||||
(p12) LDFD f37 = [X], INCX | |||||
#ifndef COMPLEX | |||||
/* p15 = bit 0 of N is set (only tested in the real build) */
tbit.z p0, p15 = N, 0 | |||||
#endif | |||||
;; | |||||
(p12) LDFD f38 = [X], STRIDE | |||||
;; | |||||
(p12) LDFD f39 = [X], INCX | |||||
;; | |||||
(p13) LDFD f40 = [X], STRIDE | |||||
;; | |||||
(p13) LDFD f41 = [X], INCX | |||||
;; | |||||
/* from here on the additions are interleaved with the remaining loads */
(p13) LDFD f42 = [X], STRIDE | |||||
(p12) FADD f8 = f8, f32 | |||||
;; | |||||
(p13) LDFD f43 = [X], INCX | |||||
(p12) FADD f9 = f9, f33 | |||||
;; | |||||
(p14) LDFD f44 = [X], STRIDE | |||||
(p12) FADD f10 = f10, f34 | |||||
;; | |||||
(p14) LDFD f45 = [X], INCX | |||||
(p12) FADD f11 = f11, f35 | |||||
;; | |||||
#ifndef COMPLEX | |||||
/* last single value; X is not advanced any further */
(p15) LDFD f46 = [X] | |||||
#endif | |||||
(p12) FADD f8 = f8, f36 | |||||
;; | |||||
(p12) FADD f9 = f9, f37 | |||||
(p12) FADD f10 = f10, f38 | |||||
(p12) FADD f11 = f11, f39 | |||||
;; | |||||
(p13) FADD f8 = f8, f40 | |||||
(p13) FADD f9 = f9, f41 | |||||
#ifndef COMPLEX | |||||
#endif | |||||
(p13) FADD f10 = f10, f42 | |||||
;; | |||||
(p13) FADD f11 = f11, f43 | |||||
(p14) FADD f8 = f8, f44 | |||||
(p14) FADD f9 = f9, f45 | |||||
#ifndef COMPLEX | |||||
(p15) FADD f10 = f10, f46 | |||||
#endif | |||||
;; | |||||
/* Exit: folds the four partial sums into f8 as (f8 + f9) + (f10 + f11),   */
/* writes the saved ar.lc and predicate registers back, and returns with   */
/* the total left in f8.                                                   */
.align 32 | |||||
.L998: | |||||
{ .mfi | |||||
FADD f8 = f8, f9 | |||||
/* put back the loop counter saved on entry */
mov ar.lc = ARLC | |||||
} | |||||
{ .mmf | |||||
FADD f10 = f10, f11 | |||||
} | |||||
;; | |||||
{ .mii | |||||
/* restore predicates from PR; the immediate is the mask selecting   */
/* which predicate registers are written                             */
mov pr = PR, -65474 | |||||
} | |||||
;; | |||||
{ .mfb | |||||
FADD f8 = f8, f10 | |||||
br.ret.sptk.many b0 | |||||
} | |||||
EPILOGUE |
@@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c | |||||
ISMINKERNEL = ../mips/imin.c | ISMINKERNEL = ../mips/imin.c | ||||
IDMINKERNEL = ../mips/imin.c | IDMINKERNEL = ../mips/imin.c | ||||
SSUMKERNEL = ../mips/sum.c | |||||
DSUMKERNEL = ../mips/sum.c | |||||
CSUMKERNEL = ../mips/zsum.c | |||||
ZSUMKERNEL = ../mips/zsum.c | |||||
ifdef HAVE_MSA | ifdef HAVE_MSA | ||||
SASUMKERNEL = ../mips/sasum_msa.c | SASUMKERNEL = ../mips/sasum_msa.c | ||||
DASUMKERNEL = ../mips/dasum_msa.c | DASUMKERNEL = ../mips/dasum_msa.c | ||||
@@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
while(i < n) | while(i < n) | ||||
{ | { | ||||
if( x[ix] > minf ) | |||||
if( x[ix] < minf ) | |||||
{ | { | ||||
min = i; | min = i; | ||||
minf = x[ix]; | minf = x[ix]; | ||||
@@ -0,0 +1,47 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
/*
 * Plain sum (no absolute values) of n entries of x, taking every
 * inc_x-th entry starting at x[0].
 *
 * Returns 0.0 without reading x when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0)
		return total;

	/* one past the last index visited: n steps of inc_x each */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return total;
}
@@ -0,0 +1,52 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <math.h> | |||||
/*
 * Sum of the two adjacent entries x[i] and x[i+1].
 * Parameters and the whole body are parenthesized so the expansion keeps
 * its meaning inside a larger expression or with a compound argument.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Sum of n two-entry elements of x (presumably the real and imaginary
 * parts of complex values - confirm against the caller), where successive
 * elements are 2 * inc_x entries apart.  Both entries of every element
 * are added into a single scalar.
 *
 * Returns 0.0 without reading x when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG inc_x2;
	BLASLONG end;
	FLOAT sumf = 0.0;

	if (n <= 0 || inc_x <= 0) return(sumf);

	/* distance between elements, and one past the last index visited */
	inc_x2 = 2 * inc_x;
	end = n * inc_x2;

	for (i = 0; i < end; i += inc_x2)
	{
		sumf += CSUM1(x,i);
	}
	return(sumf);
}
@@ -0,0 +1,332 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
/* Register aliases used by this kernel.                                   */
/* N, X, INCX: the three arguments as the code reads them.                 */
#define N $4 | |||||
#define X $5 | |||||
#define INCX $6 | |||||
/* I: loop counter.  TEMP: scratch, holds SIZE for the stride comparison.  */
#define I $2 | |||||
#define TEMP $3 | |||||
/* a1-a8: destinations of the eight loads of one unrolled pass.            */
#define a1 $f2 | |||||
#define a2 $f3 | |||||
#define a3 $f4 | |||||
#define a4 $f5 | |||||
#define a5 $f6 | |||||
#define a6 $f7 | |||||
#define a7 $f8 | |||||
#define a8 $f9 | |||||
/* t1-t4: values staged for the next additions.                            */
#define t1 $f10 | |||||
#define t2 $f11 | |||||
#define t3 $f12 | |||||
#define t4 $f13 | |||||
/* s1, s2: the two running partial sums; they are added together at .L999. */
#define s1 $f0 | |||||
#define s2 $f1 | |||||
/* Entry.  Zeroes the accumulators s1/s2, scales INCX by 2^BASE_SHIFT,     */
/* leaves through .L999 when N <= 0, and then selects the contiguous path  */
/* (scaled INCX == SIZE, continues below) or the strided path at .L20.     */
/* NOTE(review): the code is laid out for branch delay slots - the         */
/* instruction after each branch is expected to run on both outcomes       */
/* (e.g. "dsra I, N, 3" after the bne, since .L20 reads I).  Confirm that  */
/* noreorder mode is in effect; it is not set in the visible lines.        */
PROLOGUE | |||||
#ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: load the values they point to */
LDINT N, 0(N) | |||||
LDINT INCX, 0(INCX) | |||||
#endif | |||||
/* $0 is the zero register, so both accumulators start cleared */
MTC $0, s1 | |||||
MTC $0, s2 | |||||
dsll INCX, INCX, BASE_SHIFT | |||||
blez N, .L999 | |||||
li TEMP, SIZE | |||||
bne INCX, TEMP, .L20 | |||||
/* I = N / 8: number of 8-element passes */
dsra I, N, 3 | |||||
blez I, .L15 | |||||
NOP | |||||
/* Preload the first 8 values and stage the first four in t1-t4 */
LD a1, 0 * SIZE(X) | |||||
LD a2, 1 * SIZE(X) | |||||
LD a3, 2 * SIZE(X) | |||||
LD a4, 3 * SIZE(X) | |||||
LD a5, 4 * SIZE(X) | |||||
MOV t1, a1 | |||||
LD a6, 5 * SIZE(X) | |||||
MOV t2, a2 | |||||
LD a7, 6 * SIZE(X) | |||||
MOV t3, a3 | |||||
MOV t4, a4 | |||||
/* one pass is already in flight; skip the loop if it was the only one */
daddiu I, I, -1 | |||||
blez I, .L13 | |||||
LD a8, 7 * SIZE(X) | |||||
/* Contiguous main loop.  Each pass adds the 8 values loaded by the        */
/* previous pass (t1/t3 into s1, t2/t4 into s2) while loading the next 8   */
/* at offsets 8..15 * SIZE from the old X; X advances by 8 * SIZE midway,  */
/* which is why the later loads use offsets 4..7.                          */
.align 3 | |||||
.L12: | |||||
ADD s1, s1, t1 | |||||
LD a1, 8 * SIZE(X) | |||||
MOV t1, a5 | |||||
daddiu I, I, -1 | |||||
ADD s2, s2, t2 | |||||
LD a2, 9 * SIZE(X) | |||||
MOV t2, a6 | |||||
NOP | |||||
ADD s1, s1, t3 | |||||
LD a3, 10 * SIZE(X) | |||||
MOV t3, a7 | |||||
NOP | |||||
ADD s2, s2, t4 | |||||
LD a4, 11 * SIZE(X) | |||||
MOV t4, a8 | |||||
daddiu X, X, 8 * SIZE | |||||
ADD s1, s1, t1 | |||||
LD a5, 4 * SIZE(X) | |||||
MOV t1, a1 | |||||
NOP | |||||
ADD s2, s2, t2 | |||||
LD a6, 5 * SIZE(X) | |||||
MOV t2, a2 | |||||
NOP | |||||
ADD s1, s1, t3 | |||||
LD a7, 6 * SIZE(X) | |||||
MOV t3, a3 | |||||
NOP | |||||
ADD s2, s2, t4 | |||||
LD a8, 7 * SIZE(X) | |||||
bgtz I, .L12 | |||||
MOV t4, a4 | |||||
.align 3 | |||||
/* Drain: add the last 8 loaded values without issuing further loads,   */
/* and step X past them.                                                */
.L13: | |||||
ADD s1, s1, t1 | |||||
daddiu X, X, 8 * SIZE | |||||
MOV t1, a5 | |||||
NOP | |||||
ADD s2, s2, t2 | |||||
MOV t2, a6 | |||||
ADD s1, s1, t3 | |||||
MOV t3, a7 | |||||
ADD s2, s2, t4 | |||||
MOV t4, a8 | |||||
ADD s1, s1, t1 | |||||
ADD s2, s2, t2 | |||||
ADD s1, s1, t3 | |||||
ADD s2, s2, t4 | |||||
/* Contiguous remainder: the N & 7 values left after the 8-wide passes are */
/* added one at a time into s1, then control jumps to the exit at .L999.   */
.align 3 | |||||
.L15: | |||||
andi I, N, 7 | |||||
blez I, .L999 | |||||
NOP | |||||
.align 3 | |||||
.L16: | |||||
LD a1, 0 * SIZE(X) | |||||
daddiu I, I, -1 | |||||
MOV t1, a1 | |||||
ADD s1, s1, t1 | |||||
bgtz I, .L16 | |||||
/* advance to the next value (placed after the branch) */
daddiu X, X, SIZE | |||||
j .L999 | |||||
NOP | |||||
/* Strided path (scaled INCX != SIZE).  I already holds N / 8 from the     */
/* instruction that follows the bne at entry.  Preloads the first 8        */
/* values, stepping X by INCX after each load, and stages the first four   */
/* in t1-t4.                                                               */
.align 3 | |||||
.L20: | |||||
blez I, .L25 | |||||
NOP | |||||
LD a1, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a2, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a3, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a4, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a5, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a6, 0 * SIZE(X) | |||||
daddu X, X, INCX | |||||
MOV t1, a1 | |||||
LD a7, 0 * SIZE(X) | |||||
MOV t2, a2 | |||||
daddu X, X, INCX | |||||
MOV t3, a3 | |||||
LD a8, 0 * SIZE(X) | |||||
MOV t4, a4 | |||||
/* one pass is already in flight; skip the loop if it was the only one */
daddiu I, I, -1 | |||||
blez I, .L24 | |||||
daddu X, X, INCX | |||||
/* Strided main loop.  Same scheme as the contiguous loop: each pass adds  */
/* the 8 values loaded by the previous pass (t1/t3 into s1, t2/t4 into s2) */
/* while loading the next 8, with X stepped by INCX after every load.      */
.align 3 | |||||
.L23: | |||||
ADD s1, s1, t1 | |||||
LD a1, 0 * SIZE(X) | |||||
MOV t1, a5 | |||||
daddu X, X, INCX | |||||
ADD s2, s2, t2 | |||||
LD a2, 0 * SIZE(X) | |||||
MOV t2, a6 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t3 | |||||
LD a3, 0 * SIZE(X) | |||||
MOV t3, a7 | |||||
daddu X, X, INCX | |||||
ADD s2, s2, t4 | |||||
LD a4, 0 * SIZE(X) | |||||
MOV t4, a8 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t1 | |||||
LD a5, 0 * SIZE(X) | |||||
MOV t1, a1 | |||||
daddu X, X, INCX | |||||
ADD s2, s2, t2 | |||||
LD a6, 0 * SIZE(X) | |||||
MOV t2, a2 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t3 | |||||
LD a7, 0 * SIZE(X) | |||||
MOV t3, a3 | |||||
daddu X, X, INCX | |||||
ADD s2, s2, t4 | |||||
LD a8, 0 * SIZE(X) | |||||
MOV t4, a4 | |||||
daddiu I, I, -1 | |||||
bgtz I, .L23 | |||||
daddu X, X, INCX | |||||
.align 3 | |||||
/* Drain: add the last 8 loaded values without issuing further loads.   */
.L24: | |||||
ADD s1, s1, t1 | |||||
MOV t1, a5 | |||||
ADD s2, s2, t2 | |||||
MOV t2, a6 | |||||
ADD s1, s1, t3 | |||||
MOV t3, a7 | |||||
ADD s2, s2, t4 | |||||
MOV t4, a8 | |||||
ADD s1, s1, t1 | |||||
ADD s2, s2, t2 | |||||
ADD s1, s1, t3 | |||||
ADD s2, s2, t4 | |||||
/* Strided remainder: the N & 7 values left after the 8-wide passes are    */
/* added one at a time into s1, stepping X by INCX; execution then falls   */
/* through into the exit at .L999.                                         */
.align 3 | |||||
.L25: | |||||
andi I, N, 7 | |||||
blez I, .L999 | |||||
NOP | |||||
.align 3 | |||||
.L26: | |||||
LD a1, 0 * SIZE(X) | |||||
daddiu I, I, -1 | |||||
MOV t1, a1 | |||||
daddu X, X, INCX | |||||
bgtz I, .L26 | |||||
/* the addition for this value is placed after the branch */
ADD s1, s1, t1 | |||||
/* Exit: return through $31; the instruction after the jump folds the two  */
/* partial sums together, leaving the total in s1.                         */
.align 3 | |||||
.L999: | |||||
j $31 | |||||
ADD s1, s1, s2 | |||||
EPILOGUE |
@@ -0,0 +1,204 @@ | |||||
/*********************************************************************/ | |||||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
/* All rights reserved. */ | |||||
/* */ | |||||
/* Redistribution and use in source and binary forms, with or */ | |||||
/* without modification, are permitted provided that the following */ | |||||
/* conditions are met: */ | |||||
/* */ | |||||
/* 1. Redistributions of source code must retain the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer. */ | |||||
/* */ | |||||
/* 2. Redistributions in binary form must reproduce the above */ | |||||
/* copyright notice, this list of conditions and the following */ | |||||
/* disclaimer in the documentation and/or other materials */ | |||||
/* provided with the distribution. */ | |||||
/* */ | |||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||||
/* */ | |||||
/* The views and conclusions contained in the software and */ | |||||
/* documentation are those of the authors and should not be */ | |||||
/* interpreted as representing official policies, either expressed */ | |||||
/* or implied, of The University of Texas at Austin. */ | |||||
/*********************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
/* Register aliases used by this kernel.                                   */
/* N, X, INCX: the three arguments as the code reads them.                 */
#define N $4 | |||||
#define X $5 | |||||
#define INCX $6 | |||||
/* I: loop counter.  TEMP: defined but not referenced in this kernel.      */
#define I $2 | |||||
#define TEMP $3 | |||||
/* a1-a8: destinations of the eight loads (four pairs) of one pass.        */
#define a1 $f2 | |||||
#define a2 $f3 | |||||
#define a3 $f4 | |||||
#define a4 $f5 | |||||
#define a5 $f6 | |||||
#define a6 $f7 | |||||
#define a7 $f8 | |||||
#define a8 $f9 | |||||
/* t1-t4: values staged for the next additions.                            */
#define t1 $f10 | |||||
#define t2 $f11 | |||||
#define t3 $f12 | |||||
#define t4 $f13 | |||||
/* s1, s2: the two running partial sums; they are added together at .L999. */
#define s1 $f0 | |||||
#define s2 $f1 | |||||
/* Entry.  Zeroes the accumulators s1/s2, scales INCX by 2^ZBASE_SHIFT,    */
/* leaves through .L999 when N <= 0, and preloads the first four pairs     */
/* (offsets 0 and 1 * SIZE at each position of X) for the loop at .L23.    */
/* NOTE(review): the code is laid out for branch delay slots - the         */
/* instruction after each branch is expected to run on both outcomes.      */
/* Confirm noreorder mode is in effect; it is not set in the visible lines.*/
PROLOGUE | |||||
#ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: load the values they point to */
LDINT N, 0(N) | |||||
LDINT INCX, 0(INCX) | |||||
#endif | |||||
/* $0 is the zero register, so both accumulators start cleared */
MTC $0, s1 | |||||
MTC $0, s2 | |||||
dsll INCX, INCX, ZBASE_SHIFT | |||||
blez N, .L999 | |||||
/* I = N / 4: number of 4-pair passes */
dsra I, N, 2 | |||||
blez I, .L25 | |||||
NOP | |||||
LD a1, 0 * SIZE(X) | |||||
LD a2, 1 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a3, 0 * SIZE(X) | |||||
LD a4, 1 * SIZE(X) | |||||
daddu X, X, INCX | |||||
LD a5, 0 * SIZE(X) | |||||
LD a6, 1 * SIZE(X) | |||||
daddu X, X, INCX | |||||
MOV t1, a1 | |||||
MOV t2, a2 | |||||
LD a7, 0 * SIZE(X) | |||||
LD a8, 1 * SIZE(X) | |||||
MOV t3, a3 | |||||
MOV t4, a4 | |||||
/* one pass is already in flight; skip the loop if it was the only one */
daddiu I, I, -1 | |||||
blez I, .L24 | |||||
daddu X, X, INCX | |||||
/* Main loop.  Each pass adds the four pairs loaded by the previous pass   */
/* while loading the next four.  Values read at offset 0 go into s1 and    */
/* values read at offset 1 * SIZE go into s2; X steps by INCX per pair.    */
.align 3 | |||||
.L23: | |||||
ADD s1, s1, t1 | |||||
LD a1, 0 * SIZE(X) | |||||
MOV t1, a5 | |||||
daddiu I, I, -1 | |||||
ADD s2, s2, t2 | |||||
LD a2, 1 * SIZE(X) | |||||
MOV t2, a6 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t3 | |||||
LD a3, 0 * SIZE(X) | |||||
MOV t3, a7 | |||||
NOP | |||||
ADD s2, s2, t4 | |||||
LD a4, 1 * SIZE(X) | |||||
MOV t4, a8 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t1 | |||||
LD a5, 0 * SIZE(X) | |||||
MOV t1, a1 | |||||
NOP | |||||
ADD s2, s2, t2 | |||||
LD a6, 1 * SIZE(X) | |||||
MOV t2, a2 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t3 | |||||
LD a7, 0 * SIZE(X) | |||||
MOV t3, a3 | |||||
LD a8, 1 * SIZE(X) | |||||
ADD s2, s2, t4 | |||||
daddu X, X, INCX | |||||
bgtz I, .L23 | |||||
MOV t4, a4 | |||||
.align 3 | |||||
/* Drain: add the last four loaded pairs without issuing further loads. */
.L24: | |||||
ADD s1, s1, t1 | |||||
MOV t1, a5 | |||||
ADD s2, s2, t2 | |||||
MOV t2, a6 | |||||
ADD s1, s1, t3 | |||||
MOV t3, a7 | |||||
ADD s2, s2, t4 | |||||
MOV t4, a8 | |||||
ADD s1, s1, t1 | |||||
ADD s2, s2, t2 | |||||
ADD s1, s1, t3 | |||||
ADD s2, s2, t4 | |||||
/* Remainder: the N & 3 pairs left after the 4-pair passes are added one   */
/* pair at a time (offset 0 into s1, offset 1 * SIZE into s2); execution   */
/* then falls through into the exit at .L999.                              */
.align 3 | |||||
.L25: | |||||
andi I, N, 3 | |||||
blez I, .L999 | |||||
NOP | |||||
.align 3 | |||||
.L26: | |||||
LD a1, 0 * SIZE(X) | |||||
LD a2, 1 * SIZE(X) | |||||
MOV t1, a1 | |||||
daddiu I, I, -1 | |||||
MOV t2, a2 | |||||
daddu X, X, INCX | |||||
ADD s1, s1, t1 | |||||
bgtz I, .L26 | |||||
/* the second addition of the pair is placed after the branch */
ADD s2, s2, t2 | |||||
/* Exit: return through $31; the instruction after the jump folds the two  */
/* partial sums together, leaving the total in s1.                         */
.align 3 | |||||
.L999: | |||||
j $31 | |||||
ADD s1, s1, s2 | |||||
EPILOGUE |
@@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S | SGEMMITCOPY = sgemm_tcopy_16_power8.S | ||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S | SGEMMOTCOPY = sgemm_tcopy_8_power8.S | ||||
SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMKERNEL = dgemm_kernel_16x4_power8.S | DGEMMKERNEL = dgemm_kernel_16x4_power8.S | ||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c | DGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S | DGEMMITCOPY = dgemm_tcopy_16_power8.S | ||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S | DGEMMONCOPY = dgemm_ncopy_4_power8.S | ||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S | CGEMMKERNEL = cgemm_kernel_8x4_power8.S | ||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S | CGEMMITCOPY = cgemm_tcopy_8_power8.S | ||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ||||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | ||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S | ZGEMMITCOPY = zgemm_tcopy_8_power8.S | ||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
ZGEMMINCOPYOBJ = zgemm_incopy.o | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
#SMINKERNEL = ../arm/min.c | #SMINKERNEL = ../arm/min.c | ||||
#DMINKERNEL = ../arm/min.c | #DMINKERNEL = ../arm/min.c | ||||
# | # | ||||
#ISAMAXKERNEL = ../arm/iamax.c | |||||
ISAMAXKERNEL = isamax.c | |||||
IDAMAXKERNEL = idamax.c | IDAMAXKERNEL = idamax.c | ||||
#ICAMAXKERNEL = ../arm/izamax.c | |||||
IZAMAXKERNEL = izamax.c | |||||
ICAMAXKERNEL = icamax.c | |||||
IZAMAXKERNEL = izamax.c | |||||
# | # | ||||
#ISAMINKERNEL = ../arm/iamin.c | |||||
IDAMINKERNEL = idamin.c | |||||
#ICAMINKERNEL = ../arm/izamin.c | |||||
ISAMINKERNEL = isamin.c | |||||
IDAMINKERNEL = idamin.c | |||||
ICAMINKERNEL = icamin.c | |||||
IZAMINKERNEL = izamin.c | IZAMINKERNEL = izamin.c | ||||
# | # | ||||
#ISMAXKERNEL = ../arm/imax.c | #ISMAXKERNEL = ../arm/imax.c | ||||
@@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c | |||||
CASUMKERNEL = casum.c | CASUMKERNEL = casum.c | ||||
ZASUMKERNEL = zasum.c | ZASUMKERNEL = zasum.c | ||||
# | # | ||||
#SAXPYKERNEL = ../arm/axpy.c | |||||
SAXPYKERNEL = saxpy.c | |||||
DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
#CAXPYKERNEL = ../arm/zaxpy.c | |||||
CAXPYKERNEL = caxpy.c | |||||
ZAXPYKERNEL = zaxpy.c | ZAXPYKERNEL = zaxpy.c | ||||
# | # | ||||
SCOPYKERNEL = scopy.c | SCOPYKERNEL = scopy.c | ||||
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c | |||||
SDOTKERNEL = sdot.c | SDOTKERNEL = sdot.c | ||||
DDOTKERNEL = ddot.c | DDOTKERNEL = ddot.c | ||||
DSDOTKERNEL = sdot.c | DSDOTKERNEL = sdot.c | ||||
#CDOTKERNEL = ../arm/zdot.c | |||||
CDOTKERNEL = cdot.c | |||||
ZDOTKERNEL = zdot.c | ZDOTKERNEL = zdot.c | ||||
# | # | ||||
SNRM2KERNEL = ../arm/nrm2.c | SNRM2KERNEL = ../arm/nrm2.c | ||||
@@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||||
# | # | ||||
SROTKERNEL = srot.c | SROTKERNEL = srot.c | ||||
DROTKERNEL = drot.c | DROTKERNEL = drot.c | ||||
CROTKERNEL = zrot.c | |||||
CROTKERNEL = crot.c | |||||
ZROTKERNEL = zrot.c | ZROTKERNEL = zrot.c | ||||
# | # | ||||
SSCALKERNEL = sscal.c | SSCALKERNEL = sscal.c | ||||
@@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c | |||||
ZSWAPKERNEL = zswap.c | ZSWAPKERNEL = zswap.c | ||||
# | # | ||||
#SGEMVNKERNEL = ../arm/gemv_n.c | |||||
SGEMVNKERNEL = sgemv_n.c | |||||
DGEMVNKERNEL = dgemv_n.c | DGEMVNKERNEL = dgemv_n.c | ||||
#CGEMVNKERNEL = ../arm/zgemv_n.c | |||||
CGEMVNKERNEL = cgemv_n.c | |||||
ZGEMVNKERNEL = zgemv_n_4.c | ZGEMVNKERNEL = zgemv_n_4.c | ||||
# | # | ||||
#SGEMVTKERNEL = ../arm/gemv_t.c | |||||
SGEMVTKERNEL = sgemv_t.c | |||||
DGEMVTKERNEL = dgemv_t.c | DGEMVTKERNEL = dgemv_t.c | ||||
#CGEMVTKERNEL = ../arm/zgemv_t.c | |||||
CGEMVTKERNEL = cgemv_t.c | |||||
ZGEMVTKERNEL = zgemv_t_4.c | ZGEMVTKERNEL = zgemv_t_4.c | ||||
@@ -0,0 +1,184 @@ | |||||
#SGEMM_BETA = ../generic/gemm_beta.c | |||||
#DGEMM_BETA = ../generic/gemm_beta.c | |||||
#CGEMM_BETA = ../generic/zgemm_beta.c | |||||
#ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
STRMMKERNEL = strmm_kernel_16x8_power8.S | |||||
DTRMMKERNEL = dgemm_kernel_power9.S | |||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||||
SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMKERNEL = dgemm_kernel_power9.S | |||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||||
#Pure C for other kernels | |||||
#SAMAXKERNEL = ../arm/amax.c | |||||
#DAMAXKERNEL = ../arm/amax.c | |||||
#CAMAXKERNEL = ../arm/zamax.c | |||||
#ZAMAXKERNEL = ../arm/zamax.c | |||||
# | |||||
#SAMINKERNEL = ../arm/amin.c | |||||
#DAMINKERNEL = ../arm/amin.c | |||||
#CAMINKERNEL = ../arm/zamin.c | |||||
#ZAMINKERNEL = ../arm/zamin.c | |||||
# | |||||
#SMAXKERNEL = ../arm/max.c | |||||
#DMAXKERNEL = ../arm/max.c | |||||
# | |||||
#SMINKERNEL = ../arm/min.c | |||||
#DMINKERNEL = ../arm/min.c | |||||
# | |||||
ISAMAXKERNEL = isamax.c | |||||
IDAMAXKERNEL = idamax.c | |||||
ICAMAXKERNEL = icamax.c | |||||
IZAMAXKERNEL = izamax.c | |||||
# | |||||
ISAMINKERNEL = isamin.c | |||||
IDAMINKERNEL = idamin.c | |||||
ICAMINKERNEL = icamin.c | |||||
IZAMINKERNEL = izamin.c | |||||
# | |||||
#ISMAXKERNEL = ../arm/imax.c | |||||
#IDMAXKERNEL = ../arm/imax.c | |||||
# | |||||
#ISMINKERNEL = ../arm/imin.c | |||||
#IDMINKERNEL = ../arm/imin.c | |||||
# | |||||
SASUMKERNEL = sasum.c | |||||
DASUMKERNEL = dasum.c | |||||
CASUMKERNEL = casum.c | |||||
ZASUMKERNEL = zasum.c | |||||
# | |||||
SAXPYKERNEL = saxpy.c | |||||
DAXPYKERNEL = daxpy.c | |||||
CAXPYKERNEL = caxpy.c | |||||
ZAXPYKERNEL = zaxpy.c | |||||
# | |||||
SCOPYKERNEL = scopy.c | |||||
DCOPYKERNEL = dcopy.c | |||||
CCOPYKERNEL = ccopy.c | |||||
ZCOPYKERNEL = zcopy.c | |||||
# | |||||
SDOTKERNEL = sdot.c | |||||
DDOTKERNEL = ddot.c | |||||
DSDOTKERNEL = sdot.c | |||||
CDOTKERNEL = cdot.c | |||||
ZDOTKERNEL = zdot.c | |||||
# | |||||
SNRM2KERNEL = ../arm/nrm2.c | |||||
DNRM2KERNEL = ../arm/nrm2.c | |||||
CNRM2KERNEL = ../arm/znrm2.c | |||||
ZNRM2KERNEL = ../arm/znrm2.c | |||||
# | |||||
SROTKERNEL = srot.c | |||||
DROTKERNEL = drot.c | |||||
CROTKERNEL = crot.c | |||||
ZROTKERNEL = zrot.c | |||||
# | |||||
SSCALKERNEL = sscal.c | |||||
DSCALKERNEL = dscal.c | |||||
CSCALKERNEL = zscal.c | |||||
ZSCALKERNEL = zscal.c | |||||
# | |||||
SSWAPKERNEL = sswap.c | |||||
DSWAPKERNEL = dswap.c | |||||
CSWAPKERNEL = cswap.c | |||||
ZSWAPKERNEL = zswap.c | |||||
# | |||||
SGEMVNKERNEL = sgemv_n.c | |||||
DGEMVNKERNEL = dgemv_n.c | |||||
CGEMVNKERNEL = cgemv_n.c | |||||
ZGEMVNKERNEL = zgemv_n_4.c | |||||
# | |||||
SGEMVTKERNEL = sgemv_t.c | |||||
DGEMVTKERNEL = dgemv_t.c | |||||
CGEMVTKERNEL = cgemv_t.c | |||||
ZGEMVTKERNEL = zgemv_t_4.c | |||||
#SSYMV_U_KERNEL = ../generic/symv_k.c | |||||
#SSYMV_L_KERNEL = ../generic/symv_k.c | |||||
#DSYMV_U_KERNEL = ../generic/symv_k.c | |||||
#DSYMV_L_KERNEL = ../generic/symv_k.c | |||||
#QSYMV_U_KERNEL = ../generic/symv_k.c | |||||
#QSYMV_L_KERNEL = ../generic/symv_k.c | |||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
LSAME_KERNEL = ../generic/lsame.c | |||||
SCABS_KERNEL = ../generic/cabs.c | |||||
DCABS_KERNEL = ../generic/cabs.c | |||||
QCABS_KERNEL = ../generic/cabs.c | |||||
#Dump kernel | |||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c |
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "casum_microk_power8.c" | #include "casum_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -0,0 +1,145 @@ | |||||
/* | |||||
Copyright (c) 2013-2018, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#ifndef HAVE_ASM_KERNEL | |||||
#include <altivec.h> | |||||
/*
 * Vector kernel for the unit-stride part of complex single-precision AXPY.
 *
 * x and y are read as arrays of 4-float vectors, i.e. two interleaved
 * (re, im) pairs per vector, and y is updated in place.  With neither or
 * both of CONJ/XCONJ defined every pair becomes
 *     y_re += alpha_r * x_re - alpha_i * x_im
 *     y_im += alpha_r * x_im + alpha_i * x_re
 * otherwise
 *     y_re += alpha_r * x_re + alpha_i * x_im
 *     y_im += alpha_i * x_re - alpha_r * x_im
 *
 * n is the number of complex elements; n/2 vectors are consumed, eight per
 * iteration, so the loop covers exactly n elements only when n is a multiple
 * of 16 (the caller in this file passes n & -16).
 *
 * NOTE(review): x and y are dereferenced through vector-pointer casts;
 * whether arbitrary (non 16-byte aligned) addresses are safe depends on the
 * compiler/target - confirm.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register __vector float scale_re = {alpha_r, alpha_r, alpha_r, alpha_r};
    register __vector float scale_im = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
    register __vector float scale_re = {alpha_r, -alpha_r, alpha_r, -alpha_r};
    register __vector float scale_im = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif
    /* Byte permutation that exchanges the two floats of every (re, im) pair. */
    __vector unsigned char pair_swap = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *dst = (__vector float *) y;
    register __vector float *src = (__vector float *) x;
    const BLASLONG vec_total = n / 2;
    BLASLONG v;

    for (v = 0; v < vec_total; v += 8) {
        register __vector float *yv = dst + v;
        register __vector float *xv = src + v;

        /* Load the whole 8-vector tile of y and x before any store. */
        register __vector float acc0 = yv[0];
        register __vector float acc1 = yv[1];
        register __vector float acc2 = yv[2];
        register __vector float acc3 = yv[3];
        register __vector float acc4 = yv[4];
        register __vector float acc5 = yv[5];
        register __vector float acc6 = yv[6];
        register __vector float acc7 = yv[7];

        register __vector float in0 = xv[0];
        register __vector float in1 = xv[1];
        register __vector float in2 = xv[2];
        register __vector float in3 = xv[3];
        register __vector float in4 = xv[4];
        register __vector float in5 = xv[5];
        register __vector float in6 = xv[6];
        register __vector float in7 = xv[7];

        /* Contribution of x in its natural (re, im) order. */
        acc0 += in0 * scale_re;
        acc1 += in1 * scale_re;
        acc2 += in2 * scale_re;
        acc3 += in3 * scale_re;
        acc4 += in4 * scale_re;
        acc5 += in5 * scale_re;
        acc6 += in6 * scale_re;
        acc7 += in7 * scale_re;

        /* Swap each pair to (im, re) for the cross terms. */
        in0 = vec_perm(in0, in0, pair_swap);
        in1 = vec_perm(in1, in1, pair_swap);
        in2 = vec_perm(in2, in2, pair_swap);
        in3 = vec_perm(in3, in3, pair_swap);
        in4 = vec_perm(in4, in4, pair_swap);
        in5 = vec_perm(in5, in5, pair_swap);
        in6 = vec_perm(in6, in6, pair_swap);
        in7 = vec_perm(in7, in7, pair_swap);

        acc0 += in0 * scale_im;
        acc1 += in1 * scale_im;
        acc2 += in2 * scale_im;
        acc3 += in3 * scale_im;
        acc4 += in4 * scale_im;
        acc5 += in5 * scale_im;
        acc6 += in6 * scale_im;
        acc7 += in7 * scale_im;

        yv[0] = acc0;
        yv[1] = acc1;
        yv[2] = acc2;
        yv[3] = acc3;
        yv[4] = acc4;
        yv[5] = acc5;
        yv[6] = acc6;
        yv[7] = acc7;
    }
}
#endif | |||||
/*
 * Complex AXPY driver: updates y in place from x scaled by (da_r, da_i).
 *
 * Without CONJ each element becomes
 *     y_re += da_r * x_re - da_i * x_im
 *     y_im += da_r * x_im + da_i * x_re
 * with CONJ
 *     y_re += da_r * x_re + da_i * x_im
 *     y_im -= da_r * x_im - da_i * x_re
 *
 * n            number of complex elements; nothing is done when n <= 0
 * inc_x/inc_y  strides in complex elements
 * dummy0, dummy1, dummy, dummy2 are not used by this implementation.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG k;

    if (n <= 0) return (0);

    if ((inc_x != 1) || (inc_y != 1)) {
        /* General strides: plain scalar loop; strides are doubled because
           every complex element occupies two FLOATs. */
        const BLASLONG step_x = inc_x * 2;
        const BLASLONG step_y = inc_y * 2;
        BLASLONG off_x = 0;
        BLASLONG off_y = 0;

        for (k = 0; k < n; k++) {
#if !defined(CONJ)
            y[off_y] += (da_r * x[off_x] - da_i * x[off_x + 1]);
            y[off_y + 1] += (da_r * x[off_x + 1] + da_i * x[off_x]);
#else
            y[off_y] += (da_r * x[off_x] + da_i * x[off_x + 1]);
            y[off_y + 1] -= (da_r * x[off_x + 1] - da_i * x[off_x]);
#endif
            off_x += step_x;
            off_y += step_y;
        }
        return (0);
    }

    /* Unit stride: the largest multiple of 16 elements goes through the
       vector kernel, the remainder is finished element by element. */
    {
        const BLASLONG blocked = n & -16;
        BLASLONG off = 2 * blocked;

        if (blocked) {
            caxpy_kernel_16(blocked, x, y, da_r, da_i);
        }

        for (k = blocked; k < n; k++) {
#if !defined(CONJ)
            y[off] += (da_r * x[off] - da_i * x[off + 1]);
            y[off + 1] += (da_r * x[off + 1] + da_i * x[off]);
#else
            y[off] += (da_r * x[off] + da_i * x[off + 1]);
            y[off + 1] -= (da_r * x[off + 1] - da_i * x[off]);
#endif
            off += 2;
        }
    }
    return (0);
}
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "ccopy_microk_power8.c" | #include "ccopy_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -0,0 +1,164 @@ | |||||
/*Copyright (c) 2013-2018, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#ifndef HAVE_KERNEL_8 | |||||
#include <altivec.h> | |||||
/*
 * Vector kernel for the unit-stride part of the complex single-precision
 * dot product.
 *
 * x and y are read as arrays of 4-float vectors (two interleaved (re, im)
 * pairs each).  Four partial sums are written to dot[]:
 *     dot[0] = sum of x_re * y_re
 *     dot[1] = sum of x_im * y_im
 *     dot[2] = sum of x_re * y_im
 *     dot[3] = sum of x_im * y_re
 * dot[] is overwritten, not accumulated into.
 *
 * n is the number of complex elements; n/2 vectors are consumed, four per
 * iteration, so exactly n elements are covered only when n is a multiple of
 * 8 (the caller in this file passes n & -8).
 */
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
{
    /* Byte permutation that exchanges the two floats of every (re, im) pair. */
    __vector unsigned char pair_swap = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *yv = (__vector float *) y;
    register __vector float *xv = (__vector float *) x;
    const BLASLONG vec_total = n / 2;
    BLASLONG v;

    /* same[]-style accumulators hold x*y lane by lane, cross ones hold
       x * pair-swapped y. */
    register __vector float same0 = { 0 };
    register __vector float same1 = { 0 };
    register __vector float same2 = { 0 };
    register __vector float same3 = { 0 };
    register __vector float cross0 = { 0 };
    register __vector float cross1 = { 0 };
    register __vector float cross2 = { 0 };
    register __vector float cross3 = { 0 };
    register __vector float same_swapped;
    register __vector float cross_swapped;
    register __vector float same_total;
    register __vector float cross_total;

    for (v = 0; v < vec_total; v += 4) {
        register __vector float b0 = yv[v];
        register __vector float b1 = yv[v + 1];
        register __vector float b2 = yv[v + 2];
        register __vector float b3 = yv[v + 3];

        register __vector float a0 = xv[v];
        register __vector float a1 = xv[v + 1];
        register __vector float a2 = xv[v + 2];
        register __vector float a3 = xv[v + 3];

        register __vector float b0_sw = vec_perm(b0, b0, pair_swap);
        register __vector float b1_sw = vec_perm(b1, b1, pair_swap);
        register __vector float b2_sw = vec_perm(b2, b2, pair_swap);
        register __vector float b3_sw = vec_perm(b3, b3, pair_swap);

        same0 += a0 * b0;
        same1 += a1 * b1;
        same2 += a2 * b2;
        same3 += a3 * b3;

        cross0 += a0 * b0_sw;
        cross1 += a1 * b1_sw;
        cross2 += a2 * b2_sw;
        cross3 += a3 * b3_sw;
    }

    /* Fold the four accumulators of each kind into one vector. */
    same0 = same0 + same1 + same2 + same3;
    cross0 = cross0 + cross1 + cross2 + cross3;

    /* Exchange the two 64-bit halves and add, so lanes 0 and 1 carry the
       totals over both pairs of the vector. */
    same_swapped = vec_xxpermdi(same0, same0, 2);
    cross_swapped = vec_xxpermdi(cross0, cross0, 2);
    same_total = same0 + same_swapped;
    cross_total = cross0 + cross_swapped;

    dot[0] = same_total[0];
    dot[1] = same_total[1];
    dot[2] = cross_total[0];
    dot[3] = cross_total[1];
}
#endif | |||||
/*
 * Complex single-precision dot product driver.
 *
 * Accumulates four real partial sums over the n complex elements
 *     dot[0] = sum x_re*y_re    dot[1] = sum x_im*y_im
 *     dot[2] = sum x_re*y_im    dot[3] = sum x_im*y_re
 * and combines them as
 *     without CONJ: (dot[0] - dot[1]) + i*(dot[2] + dot[3])
 *     with CONJ:    (dot[0] + dot[1]) + i*(dot[2] - dot[3])
 *
 * n            number of complex elements; the result is 0 + 0i when n <= 0
 * inc_x/inc_y  strides in complex elements
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    OPENBLAS_COMPLEX_FLOAT result;
    /* 16-byte aligned so the four partial sums can be treated as one vector. */
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Unit stride: the largest multiple of 8 elements goes through the
           vector kernel (which overwrites dot[]); the tail is added below. */
        BLASLONG n1 = n & -8;
        BLASLONG j = 0;

        if (n1) {
            cdot_kernel_8(n1, x, y, dot);
            i = n1;
            j = n1 * 2;
        }

        while (i < n) {
            dot[0] += x[j] * y[j];
            dot[1] += x[j + 1] * y[j + 1];
            dot[2] += x[j] * y[j + 1];
            dot[3] += x[j + 1] * y[j];
            j += 2;
            i++;
        }
    } else {
        /* Convert the strides from complex elements to FLOATs.  Multiply
           rather than shift: a stride may be negative, and left-shifting a
           negative value is undefined behaviour in C. */
        inc_x *= 2;
        inc_y *= 2;

        while (i < n) {
            dot[0] += x[ix] * y[iy];
            dot[1] += x[ix + 1] * y[iy + 1];
            dot[2] += x[ix] * y[iy + 1];
            dot[3] += x[ix + 1] * y[iy];
            ix += inc_x;
            iy += inc_y;
            i++;
        }
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif
    return (result);
}
@@ -0,0 +1,585 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include <stdlib.h> | |||||
#include <stdio.h> | |||||
#include "common.h" | |||||
#include <altivec.h> | |||||
#define NBMAX 1024 | |||||
/*
 * Byte-permute control that exchanges the two 4-byte floats of every
 * (re, im) pair inside a 16-byte vector.  The kernels below load it through a
 * `__vector unsigned char *` cast, so the array must be 16-byte aligned; a
 * plain unsigned char array carries no such guarantee.
 */
static const unsigned char swap_mask_arr[] __attribute__ ((aligned(16))) = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
/*
 * Adds the contribution of four matrix columns to y:
 * y[r] += col0[r]*x0 + col1[r]*x1 + col2[r]*x2 + col3[r]*x3 (complex).
 *
 * ap   start of the first column; the following columns start lda FLOATs apart
 * x    four complex values stored as eight consecutive FLOATs
 * y    accumulation buffer, updated in place
 * n    number of complex rows; n/2 vectors are processed two at a time, so
 *      exactly n rows are covered only when n is a multiple of 4
 *
 * With neither or both of CONJ/XCONJ defined each term is
 *     re: a_re*x_re - a_im*x_im      im: a_im*x_re + a_re*x_im
 * otherwise
 *     re: a_re*x_re + a_im*x_im      im: a_re*x_im - a_im*x_re
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;
    /* Pair-swapping permute control, loaded from the file-level table. */
    __vector unsigned char pair_swap = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register __vector float x0_re = {x[0], x[0], x[0], x[0]};
    register __vector float x0_im = {-x[1], x[1], -x[1], x[1]};
    register __vector float x1_re = {x[2], x[2], x[2], x[2]};
    register __vector float x1_im = {-x[3], x[3], -x[3], x[3]};
    register __vector float x2_re = {x[4], x[4], x[4], x[4]};
    register __vector float x2_im = {-x[5], x[5], -x[5], x[5]};
    register __vector float x3_re = {x[6], x[6], x[6], x[6]};
    register __vector float x3_im = {-x[7], x[7], -x[7], x[7]};
#else
    register __vector float x0_re = {x[0], -x[0], x[0], -x[0]};
    register __vector float x0_im = {x[1], x[1], x[1], x[1]};
    register __vector float x1_re = {x[2], -x[2], x[2], -x[2]};
    register __vector float x1_im = {x[3], x[3], x[3], x[3]};
    register __vector float x2_re = {x[4], -x[4], x[4], -x[4]};
    register __vector float x2_im = {x[5], x[5], x[5], x[5]};
    register __vector float x3_re = {x[6], -x[6], x[6], -x[6]};
    register __vector float x3_im = {x[7], x[7], x[7], x[7]};
#endif
    register __vector float *yv = (__vector float *) y;
    register __vector float *c0 = (__vector float *) col0;
    register __vector float *c1 = (__vector float *) col1;
    register __vector float *c2 = (__vector float *) col2;
    register __vector float *c3 = (__vector float *) col3;
    const BLASLONG vec_total = n / 2;
    BLASLONG v;

    for (v = 0; v < vec_total; v += 2) {
        register __vector float acc_lo = yv[v];
        register __vector float acc_hi = yv[v + 1];

        register __vector float a0_lo = c0[v];
        register __vector float a1_lo = c1[v];
        register __vector float a2_lo = c2[v];
        register __vector float a3_lo = c3[v];
        register __vector float a0_hi = c0[v + 1];
        register __vector float a1_hi = c1[v + 1];
        register __vector float a2_hi = c2[v + 1];
        register __vector float a3_hi = c3[v + 1];

        /* Terms using the matrix values in natural (re, im) order. */
        acc_lo += a0_lo*x0_re + a1_lo*x1_re + a2_lo*x2_re + a3_lo*x3_re;
        acc_hi += a0_hi*x0_re + a1_hi*x1_re + a2_hi*x2_re + a3_hi*x3_re;

        /* Swap every pair to (im, re) for the cross terms. */
        a0_lo = vec_perm(a0_lo, a0_lo, pair_swap);
        a0_hi = vec_perm(a0_hi, a0_hi, pair_swap);
        a1_lo = vec_perm(a1_lo, a1_lo, pair_swap);
        a1_hi = vec_perm(a1_hi, a1_hi, pair_swap);
        a2_lo = vec_perm(a2_lo, a2_lo, pair_swap);
        a2_hi = vec_perm(a2_hi, a2_hi, pair_swap);
        a3_lo = vec_perm(a3_lo, a3_lo, pair_swap);
        a3_hi = vec_perm(a3_hi, a3_hi, pair_swap);

        acc_lo += a0_lo*x0_im + a1_lo*x1_im + a2_lo*x2_im + a3_lo*x3_im;
        acc_hi += a0_hi*x0_im + a1_hi*x1_im + a2_hi*x2_im + a3_hi*x3_im;

        yv[v] = acc_lo;
        yv[v + 1] = acc_hi;
    }
}
/*
 * Adds the contribution of two matrix columns to y:
 * y[r] += col0[r]*x0 + col1[r]*x1 (complex).
 *
 * ap   start of the first column; the second column starts lda FLOATs later
 * x    two complex values stored as four consecutive FLOATs
 * y    accumulation buffer, updated in place
 * n    number of complex rows; n/2 vectors are processed two at a time, so
 *      exactly n rows are covered only when n is a multiple of 4
 *
 * The CONJ/XCONJ selection only changes the signs baked into the broadcast
 * x vectors, as in the sibling kernels.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;
    /* Pair-swapping permute control, loaded from the file-level table. */
    __vector unsigned char pair_swap = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register __vector float x0_re = {x[0], x[0], x[0], x[0]};
    register __vector float x0_im = {-x[1], x[1], -x[1], x[1]};
    register __vector float x1_re = {x[2], x[2], x[2], x[2]};
    register __vector float x1_im = {-x[3], x[3], -x[3], x[3]};
#else
    register __vector float x0_re = {x[0], -x[0], x[0], -x[0]};
    register __vector float x0_im = {x[1], x[1], x[1], x[1]};
    register __vector float x1_re = {x[2], -x[2], x[2], -x[2]};
    register __vector float x1_im = {x[3], x[3], x[3], x[3]};
#endif
    register __vector float *yv = (__vector float *) y;
    register __vector float *c0 = (__vector float *) col0;
    register __vector float *c1 = (__vector float *) col1;
    const BLASLONG vec_total = n / 2;
    BLASLONG v;

    for (v = 0; v < vec_total; v += 2) {
        register __vector float acc_lo = yv[v];
        register __vector float acc_hi = yv[v + 1];

        register __vector float a0_lo = c0[v];
        register __vector float a1_lo = c1[v];
        register __vector float a0_hi = c0[v + 1];
        register __vector float a1_hi = c1[v + 1];

        /* (im, re) copies of the matrix values for the cross terms. */
        register __vector float a0_lo_sw = vec_perm(a0_lo, a0_lo, pair_swap);
        register __vector float a0_hi_sw = vec_perm(a0_hi, a0_hi, pair_swap);
        register __vector float a1_lo_sw = vec_perm(a1_lo, a1_lo, pair_swap);
        register __vector float a1_hi_sw = vec_perm(a1_hi, a1_hi, pair_swap);

        acc_lo += a0_lo*x0_re + a1_lo*x1_re + a0_lo_sw*x0_im + a1_lo_sw*x1_im;
        acc_hi += a0_hi*x0_re + a1_hi*x1_re + a0_hi_sw*x0_im + a1_hi_sw*x1_im;

        yv[v] = acc_lo;
        yv[v + 1] = acc_hi;
    }
}
/*
 * Adds the contribution of a single matrix column to y:
 * y[r] += col[r]*x0 (complex).
 *
 * ap   start of the column
 * x    one complex value stored as two consecutive FLOATs
 * y    accumulation buffer, updated in place
 * n    number of complex rows; n/2 vectors are processed two at a time, so
 *      exactly n rows are covered only when n is a multiple of 4
 *
 * The CONJ/XCONJ selection only changes the signs baked into the broadcast
 * x vectors, as in the sibling kernels.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* Pair-swapping permute control, loaded from the file-level table. */
    __vector unsigned char pair_swap = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register __vector float x0_re = {x[0], x[0], x[0], x[0]};
    register __vector float x0_im = {-x[1], x[1], -x[1], x[1]};
#else
    register __vector float x0_re = {x[0], -x[0], x[0], -x[0]};
    register __vector float x0_im = {x[1], x[1], x[1], x[1]};
#endif
    register __vector float *yv = (__vector float *) y;
    register __vector float *col = (__vector float *) ap;
    const BLASLONG vec_total = n / 2;
    BLASLONG v;

    for (v = 0; v < vec_total; v += 2) {
        register __vector float acc_lo = yv[v];
        register __vector float acc_hi = yv[v + 1];

        register __vector float a_lo = col[v];
        register __vector float a_hi = col[v + 1];

        /* (im, re) copies of the matrix values for the cross terms. */
        register __vector float a_lo_sw = vec_perm(a_lo, a_lo, pair_swap);
        register __vector float a_hi_sw = vec_perm(a_hi, a_hi, pair_swap);

        acc_lo += a_lo*x0_re + a_lo_sw*x0_im;
        acc_hi += a_hi*x0_re + a_hi_sw*x0_im;

        yv[v] = acc_lo;
        yv[v + 1] = acc_hi;
    }
}
/*
 * Scales the contiguous complex buffer src by (alpha_r, alpha_i) and adds it
 * to dest.
 *
 * n         number of complex elements in src
 * inc_dest  distance in FLOATs between consecutive dest elements; the value 2
 *           (contiguous) selects the vector path, anything else the scalar
 *           loop
 *
 * Without XCONJ each element adds
 *     re: alpha_r*src_re - alpha_i*src_im    im: alpha_r*src_im + alpha_i*src_re
 * with XCONJ
 *     re: alpha_r*src_re + alpha_i*src_im    im: alpha_i*src_re - alpha_r*src_im
 *
 * The vector path walks n/2 vectors two at a time, so it covers exactly n
 * elements only when n is a multiple of 4.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG k;

    if (inc_dest == 2) {
        /* Contiguous destination: vectorised update. */
        __vector unsigned char pair_swap = *((__vector unsigned char*)swap_mask_arr);
#if !defined(XCONJ)
        register __vector float scale_re = {alpha_r, alpha_r, alpha_r, alpha_r};
        register __vector float scale_im = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
        register __vector float scale_re = {alpha_r, -alpha_r, alpha_r, -alpha_r};
        register __vector float scale_im = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif
        register __vector float *in = (__vector float *) src;
        register __vector float *out = (__vector float *) dest;
        const BLASLONG vec_total = n / 2;

        for (k = 0; k < vec_total; k += 2) {
            register __vector float acc_lo = out[k];
            register __vector float acc_hi = out[k + 1];

            register __vector float s_lo = in[k];
            register __vector float s_hi = in[k + 1];
            /* (im, re) copies of the source for the cross terms. */
            register __vector float s_lo_sw = vec_perm(s_lo, s_lo, pair_swap);
            register __vector float s_hi_sw = vec_perm(s_hi, s_hi, pair_swap);

            acc_lo += s_lo*scale_re + s_lo_sw*scale_im;
            acc_hi += s_hi*scale_re + s_hi_sw*scale_im;

            out[k] = acc_lo;
            out[k + 1] = acc_hi;
        }
        return;
    }

    /* Strided destination: one complex element per iteration. */
    for (k = 0; k < n; k++) {
        FLOAT add_re;
        FLOAT add_im;
#if !defined(XCONJ)
        add_re = alpha_r * src[0] - alpha_i * src[1];
        add_im = alpha_r * src[1] + alpha_i * src[0];
#else
        add_re = alpha_r * src[0] + alpha_i * src[1];
        add_im = -alpha_r * src[1] + alpha_i * src[0];
#endif
        dest[0] += add_re;
        dest[1] += add_im;
        src += 2;
        dest += inc_dest;
    }
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | |||||
BLASLONG i; | |||||
FLOAT *a_ptr; | |||||
FLOAT *x_ptr; | |||||
FLOAT *y_ptr; | |||||
BLASLONG n1; | |||||
BLASLONG m1; | |||||
BLASLONG m2; | |||||
BLASLONG m3; | |||||
BLASLONG n2; | |||||
FLOAT xbuffer[8], *ybuffer; | |||||
if (m < 1) return (0); | |||||
if (n < 1) return (0); | |||||
ybuffer = buffer; | |||||
inc_x *= 2; | |||||
inc_y *= 2; | |||||
lda *= 2; | |||||
n1 = n / 4; | |||||
n2 = n % 4; | |||||
m3 = m % 4; | |||||
m1 = m - (m % 4); | |||||
m2 = (m % NBMAX) - (m % 4); | |||||
y_ptr = y; | |||||
BLASLONG NB = NBMAX; | |||||
while (NB == NBMAX) { | |||||
m1 -= NB; | |||||
if (m1 < 0) { | |||||
if (m2 == 0) break; | |||||
NB = m2; | |||||
} | |||||
a_ptr = a; | |||||
x_ptr = x; | |||||
memset(ybuffer, 0, NB * 2*sizeof(FLOAT)); | |||||
if (inc_x == 2) { | |||||
for (i = 0; i < n1; i++) { | |||||
cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); | |||||
a_ptr += lda << 2; | |||||
x_ptr += 8; | |||||
} | |||||
if (n2 & 2) { | |||||
cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); | |||||
x_ptr += 4; | |||||
a_ptr += 2 * lda; | |||||
} | |||||
if (n2 & 1) { | |||||
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); | |||||
x_ptr += 2; | |||||
a_ptr += lda; | |||||
} | |||||
} else { | |||||
for (i = 0; i < n1; i++) { | |||||
xbuffer[0] = x_ptr[0]; | |||||
xbuffer[1] = x_ptr[1]; | |||||
x_ptr += inc_x; | |||||
xbuffer[2] = x_ptr[0]; | |||||
xbuffer[3] = x_ptr[1]; | |||||
x_ptr += inc_x; | |||||
xbuffer[4] = x_ptr[0]; | |||||
xbuffer[5] = x_ptr[1]; | |||||
x_ptr += inc_x; | |||||
xbuffer[6] = x_ptr[0]; | |||||
xbuffer[7] = x_ptr[1]; | |||||
x_ptr += inc_x; | |||||
cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); | |||||
a_ptr += lda << 2; | |||||
} | |||||
for (i = 0; i < n2; i++) { | |||||
xbuffer[0] = x_ptr[0]; | |||||
xbuffer[1] = x_ptr[1]; | |||||
x_ptr += inc_x; | |||||
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); | |||||
a_ptr += lda; | |||||
} | |||||
} | |||||
add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); | |||||
a += 2 * NB; | |||||
y_ptr += NB * inc_y; | |||||
} | |||||
if (m3 == 0) return (0); | |||||
if (m3 == 1) { | |||||
a_ptr = a; | |||||
x_ptr = x; | |||||
FLOAT temp_r = 0.0; | |||||
FLOAT temp_i = 0.0; | |||||
if (lda == 2 && inc_x == 2) { | |||||
for (i = 0; i < (n & -2); i += 2) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; | |||||
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; | |||||
#else | |||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; | |||||
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; | |||||
#endif | |||||
a_ptr += 4; | |||||
x_ptr += 4; | |||||
} | |||||
for (; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
#else | |||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += 2; | |||||
x_ptr += 2; | |||||
} | |||||
} else { | |||||
for (i = 0; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
#else | |||||
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += lda; | |||||
x_ptr += inc_x; | |||||
} | |||||
} | |||||
#if !defined(XCONJ) | |||||
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||||
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
#else | |||||
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||||
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
#endif | |||||
return (0); | |||||
} | |||||
if (m3 == 2) { | |||||
a_ptr = a; | |||||
x_ptr = x; | |||||
FLOAT temp_r0 = 0.0; | |||||
FLOAT temp_i0 = 0.0; | |||||
FLOAT temp_r1 = 0.0; | |||||
FLOAT temp_i1 = 0.0; | |||||
if (lda == 4 && inc_x == 2) { | |||||
for (i = 0; i < (n & -2); i += 2) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; | |||||
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; | |||||
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; | |||||
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; | |||||
#else | |||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; | |||||
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; | |||||
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||||
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; | |||||
#endif | |||||
a_ptr += 8; | |||||
x_ptr += 4; | |||||
} | |||||
for (; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
#else | |||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += 4; | |||||
x_ptr += 2; | |||||
} | |||||
} else { | |||||
for (i = 0; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
#else | |||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += lda; | |||||
x_ptr += inc_x; | |||||
} | |||||
} | |||||
#if !defined(XCONJ) | |||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
#else | |||||
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
#endif | |||||
return (0); | |||||
} | |||||
if (m3 == 3) { | |||||
a_ptr = a; | |||||
x_ptr = x; | |||||
FLOAT temp_r0 = 0.0; | |||||
FLOAT temp_i0 = 0.0; | |||||
FLOAT temp_r1 = 0.0; | |||||
FLOAT temp_i1 = 0.0; | |||||
FLOAT temp_r2 = 0.0; | |||||
FLOAT temp_i2 = 0.0; | |||||
if (lda == 6 && inc_x == 2) { | |||||
for (i = 0; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
#else | |||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += 6; | |||||
x_ptr += 2; | |||||
} | |||||
} else { | |||||
for (i = 0; i < n; i++) { | |||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||||
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||||
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||||
#else | |||||
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||||
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||||
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||||
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||||
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||||
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||||
#endif | |||||
a_ptr += lda; | |||||
x_ptr += inc_x; | |||||
} | |||||
} | |||||
#if !defined(XCONJ) | |||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
#else | |||||
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
y_ptr += inc_y; | |||||
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
#endif | |||||
return (0); | |||||
} | |||||
return (0); | |||||
} | |||||
@@ -0,0 +1,571 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#define NBMAX 1024 | |||||
#include <altivec.h> | |||||
/*
 * vec_perm control bytes: inside every 8-byte group the two 4-byte words trade
 * places (bytes 4..7 first, then 0..3; likewise 12..15, then 8..11).
 * The kernels below read this table through a __vector unsigned char pointer,
 * so it is explicitly aligned to 16 bytes instead of relying on the 1-byte
 * alignment an unsigned char array is otherwise guaranteed.
 */
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
/*
 * Dot-product micro-kernel covering four adjacent columns of A.
 *
 * Column c (c = 0..3) starts at ap + c*lda floats.  For each column the kernel
 * accumulates the complex sum of column[c][k] * x[k] over the first n complex
 * entries (sign pattern selected by CONJ/XCONJ), scales the sum by
 * (alpha_r, alpha_i) and adds it to the complex value stored at y[2*c], y[2*c+1].
 *
 * Each loop trip reads two vectors (four complex values) per column, so n has
 * to be a multiple of 4 for the reads to stay inside the first n entries.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
/* Lane reductions of one accumulator: all lanes added, or odd lanes subtracted. */
#define CGEMV_T_LANES_ADD(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#define CGEMV_T_LANES_ALT(v) ((v)[0] - (v)[1] + (v)[2] - (v)[3])
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_T_REAL(straight) CGEMV_T_LANES_ALT(straight)
#define CGEMV_T_IMAG(crossed)  CGEMV_T_LANES_ADD(crossed)
#else
#define CGEMV_T_REAL(straight) CGEMV_T_LANES_ADD(straight)
#define CGEMV_T_IMAG(crossed)  CGEMV_T_LANES_ALT(crossed)
#endif
/* y element update: out += alpha * (re, im), with the XCONJ sign variant. */
#if !defined(XCONJ)
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) - alpha_i * (im); \
        (out)[1] += alpha_r * (im) + alpha_i * (re); \
    } while (0)
#else
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) + alpha_i * (im); \
        (out)[1] -= alpha_r * (im) - alpha_i * (re); \
    } while (0)
#endif

    /* Permutation that exchanges the two floats of every complex pair. */
    __vector unsigned char lane_swap = *((__vector unsigned char *) swap_mask_arr);

    /* The four columns, lda floats apart, viewed two complex values at a time. */
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;
    __vector float *vc0 = (__vector float *) col0;
    __vector float *vc1 = (__vector float *) col1;
    __vector float *vc2 = (__vector float *) col2;
    __vector float *vc3 = (__vector float *) col3;
    __vector float *vx  = (__vector float *) x;

    /*
     * Per column two running sums are kept:
     *   straightN : lane-wise x * a            (re*re, im*im, re*re, im*im)
     *   crossedN  : lane-wise swapped(x) * a   (im*re, re*im, im*re, re*im)
     */
    register __vector float straight0 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed0  = {0.0, 0.0, 0.0, 0.0};
    register __vector float straight1 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed1  = {0.0, 0.0, 0.0, 0.0};
    register __vector float straight2 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed2  = {0.0, 0.0, 0.0, 0.0};
    register __vector float straight3 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed3  = {0.0, 0.0, 0.0, 0.0};

    const BLASLONG vec_total = n / 2;   /* vectors per column */
    BLASLONG k = 0;

    while (k < vec_total) {
        register __vector float xa    = vx[k];
        register __vector float xb    = vx[k + 1];
        register __vector float xa_sw = vec_perm(xa, xa, lane_swap);
        register __vector float xb_sw = vec_perm(xb, xb, lane_swap);

        straight0 += xa * vc0[k] + xb * vc0[k + 1];
        crossed0  += xa_sw * vc0[k] + xb_sw * vc0[k + 1];
        straight1 += xa * vc1[k] + xb * vc1[k + 1];
        crossed1  += xa_sw * vc1[k] + xb_sw * vc1[k + 1];
        straight2 += xa * vc2[k] + xb * vc2[k + 1];
        crossed2  += xa_sw * vc2[k] + xb_sw * vc2[k + 1];
        straight3 += xa * vc3[k] + xb * vc3[k + 1];
        crossed3  += xa_sw * vc3[k] + xb_sw * vc3[k + 1];

        k += 2;
    }

    /* Collapse the vector sums into one complex scalar per column. */
    register FLOAT re0 = CGEMV_T_REAL(straight0);
    register FLOAT im0 = CGEMV_T_IMAG(crossed0);
    register FLOAT re1 = CGEMV_T_REAL(straight1);
    register FLOAT im1 = CGEMV_T_IMAG(crossed1);
    register FLOAT re2 = CGEMV_T_REAL(straight2);
    register FLOAT im2 = CGEMV_T_IMAG(crossed2);
    register FLOAT re3 = CGEMV_T_REAL(straight3);
    register FLOAT im3 = CGEMV_T_IMAG(crossed3);

    CGEMV_T_ACCUM_Y(y + 0, re0, im0);
    CGEMV_T_ACCUM_Y(y + 2, re1, im1);
    CGEMV_T_ACCUM_Y(y + 4, re2, im2);
    CGEMV_T_ACCUM_Y(y + 6, re3, im3);

#undef CGEMV_T_ACCUM_Y
#undef CGEMV_T_IMAG
#undef CGEMV_T_REAL
#undef CGEMV_T_LANES_ALT
#undef CGEMV_T_LANES_ADD
}
/*
 * Two-column variant of the dot-product micro-kernel.
 *
 * Columns start at ap and ap + lda floats.  For each one the complex sum of
 * column[k] * x[k] over the first n complex entries is formed (sign pattern
 * selected by CONJ/XCONJ), scaled by (alpha_r, alpha_i) and added to
 * y[0..1] (first column) and y[2..3] (second column).
 *
 * Two vectors (four complex values) are consumed per column per trip, so n
 * has to be a multiple of 4.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
#define CGEMV_T_LANES_ADD(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#define CGEMV_T_LANES_ALT(v) ((v)[0] - (v)[1] + (v)[2] - (v)[3])
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_T_REAL(straight) CGEMV_T_LANES_ALT(straight)
#define CGEMV_T_IMAG(crossed)  CGEMV_T_LANES_ADD(crossed)
#else
#define CGEMV_T_REAL(straight) CGEMV_T_LANES_ADD(straight)
#define CGEMV_T_IMAG(crossed)  CGEMV_T_LANES_ALT(crossed)
#endif
#if !defined(XCONJ)
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) - alpha_i * (im); \
        (out)[1] += alpha_r * (im) + alpha_i * (re); \
    } while (0)
#else
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) + alpha_i * (im); \
        (out)[1] -= alpha_r * (im) - alpha_i * (re); \
    } while (0)
#endif

    /* Permutation that exchanges the two floats of every complex pair. */
    __vector unsigned char lane_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *first  = (__vector float *) ap;
    __vector float *second = (__vector float *) (ap + lda);
    __vector float *xv     = (__vector float *) x;

    /* straight: x * a lane by lane; crossed: swapped(x) * a lane by lane. */
    register __vector float straight0 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed0  = {0.0, 0.0, 0.0, 0.0};
    register __vector float straight1 = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed1  = {0.0, 0.0, 0.0, 0.0};

    BLASLONG vecs_left;

    /* n / 2 vectors per column, taken two at a time. */
    for (vecs_left = n / 2; vecs_left > 0; vecs_left -= 2) {
        register __vector float xa    = xv[0];
        register __vector float xb    = xv[1];
        register __vector float xa_sw = vec_perm(xa, xa, lane_swap);
        register __vector float xb_sw = vec_perm(xb, xb, lane_swap);

        straight0 += xa * first[0] + xb * first[1];
        crossed0  += xa_sw * first[0] + xb_sw * first[1];
        straight1 += xa * second[0] + xb * second[1];
        crossed1  += xa_sw * second[0] + xb_sw * second[1];

        xv     += 2;
        first  += 2;
        second += 2;
    }

    register FLOAT re0 = CGEMV_T_REAL(straight0);
    register FLOAT im0 = CGEMV_T_IMAG(crossed0);
    register FLOAT re1 = CGEMV_T_REAL(straight1);
    register FLOAT im1 = CGEMV_T_IMAG(crossed1);

    CGEMV_T_ACCUM_Y(y + 0, re0, im0);
    CGEMV_T_ACCUM_Y(y + 2, re1, im1);

#undef CGEMV_T_ACCUM_Y
#undef CGEMV_T_IMAG
#undef CGEMV_T_REAL
#undef CGEMV_T_LANES_ALT
#undef CGEMV_T_LANES_ADD
}
/*
 * Single-column dot-product micro-kernel.
 *
 * Forms the complex sum of ap[k] * x[k] over the first n complex entries
 * (sign pattern selected by CONJ/XCONJ), scales it by (alpha_r, alpha_i) and
 * adds the result to the complex value at y[0], y[1].
 *
 * Two vectors (four complex values) are consumed per trip, so n has to be a
 * multiple of 4.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    /* Permutation that exchanges the two floats of every complex pair. */
    __vector unsigned char lane_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col = (__vector float *) ap;
    __vector float *xv  = (__vector float *) x;

    /* straight: x * a lane by lane; crossed: swapped(x) * a lane by lane. */
    register __vector float straight = {0.0, 0.0, 0.0, 0.0};
    register __vector float crossed  = {0.0, 0.0, 0.0, 0.0};

    BLASLONG vecs_left;

    for (vecs_left = n / 2; vecs_left > 0; vecs_left -= 2) {
        register __vector float xa    = xv[0];
        register __vector float xb    = xv[1];
        register __vector float xa_sw = vec_perm(xa, xa, lane_swap);
        register __vector float xb_sw = vec_perm(xb, xb, lane_swap);

        straight += xa * col[0] + xb * col[1];
        crossed  += xa_sw * col[0] + xb_sw * col[1];

        xv  += 2;
        col += 2;
    }

    /* Fold the four lanes of each accumulator into the real and imaginary sums. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register FLOAT sum_re = straight[0] - straight[1] + straight[2] - straight[3];
    register FLOAT sum_im = crossed[0] + crossed[1] + crossed[2] + crossed[3];
#else
    register FLOAT sum_re = straight[0] + straight[1] + straight[2] + straight[3];
    register FLOAT sum_im = crossed[0] - crossed[1] + crossed[2] - crossed[3];
#endif

    /* y += alpha * sum, with the XCONJ sign variant. */
#if !defined(XCONJ)
    y[0] += alpha_r * sum_re - alpha_i * sum_im;
    y[1] += alpha_r * sum_im + alpha_i * sum_re;
#else
    y[0] += alpha_r * sum_re + alpha_i * sum_im;
    y[1] -= alpha_r * sum_im - alpha_i * sum_re;
#endif
}
/*
 * Gather n two-float (complex) entries from src, spaced inc_src floats apart,
 * into dest, where they are stored back to back.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    FLOAT *in  = src;
    FLOAT *out = dest;
    BLASLONG remaining;

    for (remaining = n; remaining > 0; remaining--) {
        out[0] = in[0];
        out[1] = in[1];
        out += 2;
        in  += inc_src;
    }
}
/*
 * Complex matrix-vector product over the columns of A:
 * for every column j (0 <= j < n) the complex sum over rows i (0 <= i < m) of
 * a[i + j*lda] * x[i*inc_x] is scaled by (alpha_r, alpha_i) and added to
 * y[j*inc_y].  CONJ/XCONJ select the sign variants.
 *
 *   m, n      rows and columns of A; nothing is done when either is < 1
 *   dummy1    not used
 *   lda       column stride of A, in complex elements
 *   inc_x/y   strides of x and y, in complex elements
 *   buffer    scratch space that receives a packed copy of the current block
 *             of x whenever x is not unit-stride
 *
 * Rows are handled in blocks of at most NBMAX by the vector kernels; the
 * 1..3 rows left over when m is not a multiple of 4 are handled by scalar
 * code at the end.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
/* Real and imaginary part of one a * x product, in the CONJ/XCONJ variant. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_T_MUL_RE(ar, ai, xr, xi) ((ar) * (xr) - (ai) * (xi))
#define CGEMV_T_MUL_IM(ar, ai, xr, xi) ((ar) * (xi) + (ai) * (xr))
#else
#define CGEMV_T_MUL_RE(ar, ai, xr, xi) ((ar) * (xr) + (ai) * (xi))
#define CGEMV_T_MUL_IM(ar, ai, xr, xi) ((ar) * (xi) - (ai) * (xr))
#endif
/* y element update: out += alpha * (re, im), with the XCONJ sign variant. */
#if !defined(XCONJ)
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) - alpha_i * (im); \
        (out)[1] += alpha_r * (im) + alpha_i * (re); \
    } while (0)
#else
#define CGEMV_T_ACCUM_Y(out, re, im) do { \
        (out)[0] += alpha_r * (re) + alpha_i * (im); \
        (out)[1] -= alpha_r * (im) - alpha_i * (re); \
    } while (0)
#endif

    BLASLONG col;
    BLASLONG k;
    BLASLONG j;
    FLOAT *col_ptr;
    FLOAT *out_ptr;
    FLOAT *x_block;
    FLOAT partial[8];

    if (m < 1) return (0);
    if (n < 1) return (0);

    /* From here on all strides are counted in floats, not complex elements. */
    inc_x <<= 1;
    inc_y <<= 1;
    lda <<= 1;

    x_block = buffer;

    const BLASLONG col_quads  = n >> 2;   /* groups of four columns       */
    const BLASLONG col_rest   = n & 3;    /* columns left after the quads  */
    const BLASLONG row_rest   = m & 3;    /* rows left for the scalar tail */
    const BLASLONG last_block = (m & (NBMAX - 1)) - row_rest;
    BLASLONG rows_pending     = m - row_rest;
    BLASLONG block            = NBMAX;

    /* Full NBMAX blocks first; a final shorter block ends the loop. */
    while (block == NBMAX) {
        rows_pending -= block;
        if (rows_pending < 0) {
            if (last_block == 0) break;
            block = last_block;
        }

        out_ptr = y;
        col_ptr = a;

        /* The kernels need x packed; use it in place when it already is. */
        if (inc_x == 2)
            x_block = x;
        else
            copy_x(block, x, x_block, inc_x);

        if (inc_y == 2) {
            /* y is packed: the kernels accumulate straight into it. */
            for (col = 0; col < col_quads; col++) {
                cgemv_kernel_4x4(block, lda, col_ptr, x_block, out_ptr, alpha_r, alpha_i);
                col_ptr += lda << 2;
                out_ptr += 8;
            }
            if (col_rest & 2) {
                cgemv_kernel_4x2(block, lda, col_ptr, x_block, out_ptr, alpha_r, alpha_i);
                col_ptr += lda << 1;
                out_ptr += 4;
            }
            if (col_rest & 1) {
                cgemv_kernel_4x1(block, col_ptr, x_block, out_ptr, alpha_r, alpha_i);
                col_ptr += lda;
                out_ptr += 2;
            }
        } else {
            /* y is strided: accumulate into a zeroed scratch, then scatter. */
            for (col = 0; col < col_quads; col++) {
                memset(partial, 0, sizeof (partial));
                cgemv_kernel_4x4(block, lda, col_ptr, x_block, partial, alpha_r, alpha_i);
                col_ptr += lda << 2;
                for (k = 0; k < 4; k++) {
                    out_ptr[0] += partial[2 * k];
                    out_ptr[1] += partial[2 * k + 1];
                    out_ptr += inc_y;
                }
            }
            for (col = 0; col < col_rest; col++) {
                memset(partial, 0, sizeof (partial));
                cgemv_kernel_4x1(block, col_ptr, x_block, partial, alpha_r, alpha_i);
                col_ptr += lda;
                out_ptr[0] += partial[0];
                out_ptr[1] += partial[1];
                out_ptr += inc_y;
            }
        }

        /* Step down to the next block of rows. */
        a += 2 * block;
        x += block * inc_x;
    }

    if (row_rest == 0) return (0);

    /* Scalar tail: a and x now point at the first of the 1..3 leftover rows. */
    j = 0;
    col_ptr = a;
    out_ptr = y;

    if (row_rest == 3) {
        FLOAT re;
        FLOAT im;
        const FLOAT x0 = x[0];
        const FLOAT x1 = x[1];
        const FLOAT x2 = x[inc_x];
        const FLOAT x3 = x[inc_x + 1];
        const FLOAT x4 = x[2 * inc_x];
        const FLOAT x5 = x[2 * inc_x + 1];

        for (; j < n; j++) {
            re  = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im  = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);
            re += CGEMV_T_MUL_RE(col_ptr[2], col_ptr[3], x2, x3);
            im += CGEMV_T_MUL_IM(col_ptr[2], col_ptr[3], x2, x3);
            re += CGEMV_T_MUL_RE(col_ptr[4], col_ptr[5], x4, x5);
            im += CGEMV_T_MUL_IM(col_ptr[4], col_ptr[5], x4, x5);

            CGEMV_T_ACCUM_Y(out_ptr, re, im);

            col_ptr += lda;
            out_ptr += inc_y;
        }
    } else if (row_rest == 2) {
        FLOAT re;
        FLOAT im;
        FLOAT re_next;
        FLOAT im_next;
        const FLOAT x0 = x[0];
        const FLOAT x1 = x[1];
        const FLOAT x2 = x[inc_x];
        const FLOAT x3 = x[inc_x + 1];
        const BLASLONG paired_cols = n & -2;

        /* Two columns per trip. */
        for (; j < paired_cols; j += 2) {
            re  = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im  = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);
            re += CGEMV_T_MUL_RE(col_ptr[2], col_ptr[3], x2, x3);
            im += CGEMV_T_MUL_IM(col_ptr[2], col_ptr[3], x2, x3);
            col_ptr += lda;
            re_next  = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im_next  = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);
            re_next += CGEMV_T_MUL_RE(col_ptr[2], col_ptr[3], x2, x3);
            im_next += CGEMV_T_MUL_IM(col_ptr[2], col_ptr[3], x2, x3);

            CGEMV_T_ACCUM_Y(out_ptr, re, im);
            out_ptr += inc_y;
            CGEMV_T_ACCUM_Y(out_ptr, re_next, im_next);

            col_ptr += lda;
            out_ptr += inc_y;
        }

        /* Odd column, if any. */
        for (; j < n; j++) {
            re  = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im  = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);
            re += CGEMV_T_MUL_RE(col_ptr[2], col_ptr[3], x2, x3);
            im += CGEMV_T_MUL_IM(col_ptr[2], col_ptr[3], x2, x3);

            CGEMV_T_ACCUM_Y(out_ptr, re, im);

            col_ptr += lda;
            out_ptr += inc_y;
        }
    } else if (row_rest == 1) {
        FLOAT re;
        FLOAT im;
        FLOAT re_next;
        FLOAT im_next;
        const FLOAT x0 = x[0];
        const FLOAT x1 = x[1];
        const BLASLONG paired_cols = n & -2;

        /* Two columns per trip. */
        for (; j < paired_cols; j += 2) {
            re = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);
            col_ptr += lda;
            re_next = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im_next = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);

            CGEMV_T_ACCUM_Y(out_ptr, re, im);
            out_ptr += inc_y;
            CGEMV_T_ACCUM_Y(out_ptr, re_next, im_next);

            col_ptr += lda;
            out_ptr += inc_y;
        }

        /* Odd column, if any. */
        for (; j < n; j++) {
            re = CGEMV_T_MUL_RE(col_ptr[0], col_ptr[1], x0, x1);
            im = CGEMV_T_MUL_IM(col_ptr[0], col_ptr[1], x0, x1);

            CGEMV_T_ACCUM_Y(out_ptr, re, im);

            col_ptr += lda;
            out_ptr += inc_y;
        }
    }

    return (0);

#undef CGEMV_T_ACCUM_Y
#undef CGEMV_T_MUL_IM
#undef CGEMV_T_MUL_RE
}
@@ -0,0 +1,231 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2018, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#if defined(POWER8) || defined(POWER9) | |||||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||||
{ | |||||
__vector float t0; | |||||
__vector float t1; | |||||
__vector float t2; | |||||
__vector float t3; | |||||
__vector float t4; | |||||
__vector float t5; | |||||
__vector float t6; | |||||
__vector float t7; | |||||
__asm__ | |||||
( | |||||
"xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||||
"xxspltw 36, 36, 0 \n\t" | |||||
"xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||||
"xxspltw 37, 37, 0 \n\t" | |||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||||
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||||
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||||
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||||
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||||
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||||
"addi %[x_ptr], %[x_ptr], 64 \n\t" | |||||
"addi %[y_ptr], %[y_ptr], 64 \n\t" | |||||
"addic. %[temp_n], %[temp_n], -8 \n\t" | |||||
"ble 2f \n\t" | |||||
".p2align 5 \n\t" | |||||
"1: \n\t" | |||||
"xvmulsp 40, 32, 36 \n\t" // c * x | |||||
"xvmulsp 41, 33, 36 \n\t" | |||||
"xvmulsp 42, 34, 36 \n\t" | |||||
"xvmulsp 43, 35, 36 \n\t" | |||||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||||
"xvmulsp %x[x2], 49, 36 \n\t" | |||||
"xvmulsp %x[x1], 50, 36 \n\t" | |||||
"xvmulsp %x[x3], 51, 36 \n\t" | |||||
"xvmulsp 44, 32, 37 \n\t" // s * x | |||||
"xvmulsp 45, 33, 37 \n\t" | |||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||||
"xvmulsp 46, 34, 37 \n\t" | |||||
"xvmulsp 47, 35, 37 \n\t" | |||||
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||||
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||||
"xvmulsp %x[x5], 49, 37 \n\t" | |||||
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||||
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||||
"xvmulsp %x[x6], 50, 37 \n\t" | |||||
"xvmulsp %x[x7], 51, 37 \n\t" | |||||
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||||
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||||
"addi %[x_ptr], %[x_ptr], -64 \n\t" | |||||
"addi %[y_ptr], %[y_ptr], -64 \n\t" | |||||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||||
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||||
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||||
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||||
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||||
"addi %[x_ptr], %[x_ptr], 128 \n\t" | |||||
"addi %[y_ptr], %[y_ptr], 128 \n\t" | |||||
"addic. %[temp_n], %[temp_n], -8 \n\t" | |||||
"bgt 1b \n\t" | |||||
"2: \n\t" | |||||
"xvmulsp 40, 32, 36 \n\t" // c * x | |||||
"xvmulsp 41, 33, 36 \n\t" | |||||
"xvmulsp 42, 34, 36 \n\t" | |||||
"xvmulsp 43, 35, 36 \n\t" | |||||
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||||
"xvmulsp %x[x2], 49, 36 \n\t" | |||||
"xvmulsp %x[x1], 50, 36 \n\t" | |||||
"xvmulsp %x[x3], 51, 36 \n\t" | |||||
"xvmulsp 44, 32, 37 \n\t" // s * x | |||||
"xvmulsp 45, 33, 37 \n\t" | |||||
"xvmulsp 46, 34, 37 \n\t" | |||||
"xvmulsp 47, 35, 37 \n\t" | |||||
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||||
"xvmulsp %x[x5], 49, 37 \n\t" | |||||
"xvmulsp %x[x6], 50, 37 \n\t" | |||||
"xvmulsp %x[x7], 51, 37 \n\t" | |||||
"addi %[x_ptr], %[x_ptr], -64 \n\t" | |||||
"addi %[y_ptr], %[y_ptr], -64 \n\t" | |||||
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||||
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||||
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||||
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||||
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||||
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||||
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||||
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||||
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||||
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||||
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||||
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||||
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||||
"stxvd2x %x[x3], %[i48], %[y_ptr] " | |||||
: | |||||
[mem_x] "+m" (*(float (*)[2*n])x), | |||||
[mem_y] "+m" (*(float (*)[2*n])y), | |||||
[temp_n] "+r" (n), | |||||
[x_ptr] "+&b" (x), | |||||
[y_ptr] "+&b" (y), | |||||
[x0] "=wa" (t0), | |||||
[x1] "=wa" (t2), | |||||
[x2] "=wa" (t1), | |||||
[x3] "=wa" (t3), | |||||
[x4] "=wa" (t4), | |||||
[x5] "=wa" (t5), | |||||
[x6] "=wa" (t6), | |||||
[x7] "=wa" (t7) | |||||
: | |||||
[cos] "f" (c), | |||||
[sin] "f" (s), | |||||
[i16] "b" (16), | |||||
[i32] "b" (32), | |||||
[i48] "b" (48) | |||||
: | |||||
"cr0", | |||||
"vs32","vs33","vs34","vs35","vs36","vs37", | |||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
"vs48","vs49","vs50","vs51" | |||||
); | |||||
} | |||||
#endif | |||||
/*
 * Rotate n element pairs of x and y in place:
 *
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 *
 * Every element occupies two consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value) and both halves are rotated with the
 * same c and s.  inc_x and inc_y are strides counted in elements, so the
 * FLOAT index advances by twice the stride.
 *
 * When both strides are 1 the largest multiple of 8 elements not exceeding n
 * is handed to crot_kernel_8() and only the remainder is processed here.
 *
 * Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG done = 0;            /* elements already rotated         */
	BLASLONG xpos = 0, ypos = 0;  /* FLOAT offsets into x and y       */
	BLASLONG xstep, ystep;        /* FLOAT offsets between elements   */
	BLASLONG bulk;                /* element count given to the kernel */
	FLOAT first, second;          /* new x halves, stored after y     */

	if (n <= 0)
		return(0);

	if ((inc_x != 1) || (inc_y != 1)) {
		/* General strides: plain scalar loop over all n elements. */
		xstep = 2 * inc_x;
		ystep = 2 * inc_y;

		for (; done < n; done++) {
			first       = c*x[xpos]   + s*y[ypos];
			second      = c*x[xpos+1] + s*y[ypos+1];
			y[ypos]     = c*y[ypos]   - s*x[xpos];
			y[ypos+1]   = c*y[ypos+1] - s*x[xpos+1];
			x[xpos]     = first;
			x[xpos+1]   = second;

			xpos += xstep;
			ypos += ystep;
		}

		return(0);
	}

	/* Unit strides: vector kernel first, then the scalar tail. */
	bulk = n & -8;
	if (bulk > 0) {
		crot_kernel_8(bulk, x, y, c, s);
		done = bulk;
		xpos = 2 * bulk;
	}

	/* x and y share the same offset here because both strides are 1. */
	for (; done < n; done++) {
		first       = c*x[xpos]   + s*y[xpos];
		second      = c*x[xpos+1] + s*y[xpos+1];
		y[xpos]     = c*y[xpos]   - s*x[xpos];
		y[xpos+1]   = c*y[xpos+1] - s*x[xpos+1];
		x[xpos]     = first;
		x[xpos+1]   = second;

		xpos += 2;
	}

	return(0);
}
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "cswap_microk_power8.c" | #include "cswap_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#endif | #endif | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "dasum_microk_power8.c" | #include "dasum_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "daxpy_microk_power8.c" | #include "daxpy_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "dcopy_microk_power8.c" | #include "dcopy_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "ddot_microk_power8.c" | #include "ddot_microk_power8.c" | ||||
#endif | #endif | ||||
@@ -0,0 +1,249 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2013-2019, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD		ld

/*
 * Stack frame: the kernel lowers SP by STACKSIZE bytes on entry.
 * ALPHA_SP is the slot that receives f1 and from which alpha_r is loaded;
 * FZERO is the word that receives a zero.
 */
#define STACKSIZE	(512)
#define ALPHA_SP	(296+192)(SP)
#define FZERO		(304+192)(SP)

/*
 * Registers the kernel reads at entry.  OFFSET is reloaded from the stack
 * when TRMMKERNEL is defined.
 */
#define M		r3
#define N		r4
#define K		r5
#define OFFSET		r6
#define A		r7
#define B		r8
#define C		r9
#define LDC		r10

/* VSX register that holds alpha in both doublewords. */
#define alpha_r		vs18

/* Literal zero index, alongside the o8..o48 index registers below. */
#define o0		0

/*
 * Working registers, listed by register number.  o8/o16/o24/o32/o48 are
 * loaded with the constants 8/16/24/32/48 and PRE with 384 before the
 * main logic runs; the remaining names are for the included macro and
 * logic files.
 */
#define T3		r11
#define T4		r12
#define C4		r14
#define o8		r15
#define o24		r16
#define C2		r17
#define L		r18
#define T1		r19
#define C3		r20
#define TEMP_REG	r21
#define I		r22
#define J		r23
#define AO		r24
#define BO		r25
#define CO		r26
#define o16		r27
#define o32		r28
#define o48		r29
#define PRE		r30
#define T2		r31

#include "dgemm_macros_power9.S"
#ifndef NEEDPARAM

/*
 * POWER9 DGEMM kernel entry and exit code (also built as the TRMM kernel
 * when TRMMKERNEL is defined).
 *
 * Entry:  M, N, K, A, B, C, LDC in their aliased GPRs, alpha in f1.
 * Exit:   r3 = 0.
 *
 * The code below only builds the register save area, sets up the constant
 * index registers and alpha, and restores everything on the way out; the
 * actual computation lives in dgemm_logic_power9.S, included in between.
 *
 * Save area layout (offsets from the lowered SP, STACKSIZE bytes total):
 *     0 .. 143   f14 - f31
 *   144 .. 287   r31 - r14
 *   288 .. 479   v20 - v31 (addressed as vs52 - vs63)
 *   ALPHA_SP     alpha (copy of f1)
 *   FZERO        32-bit zero
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0			/* zero, stored to FZERO below */

	/* Floating-point registers f14-f31. */
	stfd	f14, 0(SP)
	stfd	f15, 8(SP)
	stfd	f16, 16(SP)
	stfd	f17, 24(SP)
	stfd	f18, 32(SP)
	stfd	f19, 40(SP)
	stfd	f20, 48(SP)
	stfd	f21, 56(SP)
	stfd	f22, 64(SP)
	stfd	f23, 72(SP)
	stfd	f24, 80(SP)
	stfd	f25, 88(SP)
	stfd	f26, 96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* General-purpose registers r14-r31. */
	std	r31, 144(SP)
	std	r30, 152(SP)
	std	r29, 160(SP)
	std	r28, 168(SP)
	std	r27, 176(SP)
	std	r26, 184(SP)
	std	r25, 192(SP)
	std	r24, 200(SP)
	std	r23, 208(SP)
	std	r22, 216(SP)
	std	r21, 224(SP)
	std	r20, 232(SP)
	std	r19, 240(SP)
	std	r18, 248(SP)
	std	r17, 256(SP)
	std	r16, 264(SP)
	std	r15, 272(SP)
	std	r14, 280(SP)

	/*
	 * Vector registers v20-v31.  stxv takes a VSX register number, and
	 * vector register vN is VSX register vs(32+N), so these must be named
	 * vs52-vs63.  Writing "v20" here assembles as register number 20,
	 * i.e. vs20 (the f20 register already saved above), which would leave
	 * v20-v31 unsaved.
	 */
	stxv	vs52, 288(SP)
	stxv	vs53, 304(SP)
	stxv	vs54, 320(SP)
	stxv	vs55, 336(SP)
	stxv	vs56, 352(SP)
	stxv	vs57, 368(SP)
	stxv	vs58, 384(SP)
	stxv	vs59, 400(SP)
	stxv	vs60, 416(SP)
	stxv	vs61, 432(SP)
	stxv	vs62, 448(SP)
	stxv	vs63, 464(SP)

	/* Spill alpha so it can be reloaded as a splat, and store the zero. */
	stfd	f1, ALPHA_SP
	stw	r0, FZERO

	/* Scale LDC by 2^BASE_SHIFT (presumably elements -> bytes). */
	slwi	LDC, LDC, BASE_SHIFT

#if defined(TRMMKERNEL)
	/* OFFSET is fetched from above this save area; FRAMESLOT is defined elsewhere. */
	ld	OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

	/* Nothing to do when any dimension is <= 0 (.L999_H1 is defined outside this block). */
	cmpwi	cr0, M, 0
	ble	.L999_H1
	cmpwi	cr0, N, 0
	ble	.L999_H1
	cmpwi	cr0, K, 0
	ble	.L999_H1

	/* T1 = address of the ALPHA_SP slot. */
	addi	T1, SP, 296+192

	/* Constants kept in registers for the included logic. */
	li	PRE, 384
	li	o8 , 8
	li	o16, 16
	li	o24, 24
	li	o32, 32
	li	o48, 48

	/* Load alpha into both doublewords of alpha_r. */
	lxvdsx	alpha_r, 0, T1

#include "dgemm_logic_power9.S"

.L999:
	addi	r3, 0, 0		/* return value 0 */

	/* Restore everything saved on entry, in the same order. */
	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)
	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)
	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)
	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	ld	r31, 144(SP)
	ld	r30, 152(SP)
	ld	r29, 160(SP)
	ld	r28, 168(SP)
	ld	r27, 176(SP)
	ld	r26, 184(SP)
	ld	r25, 192(SP)
	ld	r24, 200(SP)
	ld	r23, 208(SP)
	ld	r22, 216(SP)
	ld	r21, 224(SP)
	ld	r20, 232(SP)
	ld	r19, 240(SP)
	ld	r18, 248(SP)
	ld	r17, 256(SP)
	ld	r16, 264(SP)
	ld	r15, 272(SP)
	ld	r14, 280(SP)

	/* v20-v31, addressed by their VSX numbers to match the saves above. */
	lxv	vs52, 288(SP)
	lxv	vs53, 304(SP)
	lxv	vs54, 320(SP)
	lxv	vs55, 336(SP)
	lxv	vs56, 352(SP)
	lxv	vs57, 368(SP)
	lxv	vs58, 384(SP)
	lxv	vs59, 400(SP)
	lxv	vs60, 416(SP)
	lxv	vs61, 432(SP)
	lxv	vs62, 448(SP)
	lxv	vs63, 464(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#if defined(POWER8) | |||||
#if defined(POWER8) || defined(POWER9) | |||||
#include "dgemv_n_microk_power8.c" | #include "dgemv_n_microk_power8.c" | ||||
#endif | #endif | ||||