Merge develop in preparation of 0.3.6 release (tags/v0.3.6)
@@ -149,7 +149,7 @@ matrix: | |||
- &test-macos | |||
os: osx | |||
osx_image: xcode8 | |||
osx_image: xcode10.1 | |||
before_script: | |||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
- brew update | |||
@@ -160,6 +160,7 @@ matrix: | |||
- BTYPE="BINARY=64 INTERFACE64=1" | |||
- <<: *test-macos | |||
osx_image: xcode8.3 | |||
env: | |||
- BTYPE="BINARY=32" | |||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
project(OpenBLAS C ASM) | |||
set(OpenBLAS_MAJOR_VERSION 0) | |||
set(OpenBLAS_MINOR_VERSION 3) | |||
set(OpenBLAS_PATCH_VERSION 5) | |||
set(OpenBLAS_PATCH_VERSION 6) | |||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
# Adhere to GNU filesystem layout conventions | |||
@@ -42,6 +42,19 @@ endif() | |||
####### | |||
# When building with MSVC and MSVC_STATIC_CRT is set, rewrite the C and C++
# flag variables (base, Debug and Release) so that every "/MD" becomes "/MT".
if(MSVC AND MSVC_STATIC_CRT) | |||
set(CompilerFlags | |||
CMAKE_CXX_FLAGS | |||
CMAKE_CXX_FLAGS_DEBUG | |||
CMAKE_CXX_FLAGS_RELEASE | |||
CMAKE_C_FLAGS | |||
CMAKE_C_FLAGS_DEBUG | |||
CMAKE_C_FLAGS_RELEASE | |||
) | |||
# CompilerFlag holds a variable *name*; the replaced text is stored back
# into the variable of that name. Note that a plain string replace also
# turns "/MDd" into "/MTd".
foreach(CompilerFlag ${CompilerFlags}) | |||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
endforeach() | |||
endif() | |||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
@@ -62,10 +75,10 @@ endif () | |||
set(SUBDIRS ${BLASDIRS}) | |||
if (NOT NO_LAPACK) | |||
list(APPEND SUBDIRS lapack) | |||
if(BUILD_RELAPACK) | |||
list(APPEND SUBDIRS relapack/src) | |||
endif() | |||
list(APPEND SUBDIRS lapack) | |||
endif () | |||
# set which float types we want to build for | |||
@@ -134,7 +147,7 @@ endif () | |||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release | |||
if(MSVC) | |||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | |||
endif() | |||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | |||
@@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH}) | |||
endforeach() | |||
endif () | |||
# Only build shared libs for MSVC | |||
if (MSVC) | |||
set(BUILD_SHARED_LIBS ON) | |||
endif() | |||
# add objects to the openblas lib | |||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
# Android needs to explicitly link against libm | |||
if(ANDROID) | |||
@@ -166,7 +173,7 @@ endif() | |||
# Handle MSVC exports | |||
if(MSVC AND BUILD_SHARED_LIBS) | |||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
else() | |||
# Creates verbose .def file (51KB vs 18KB) | |||
@@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
) | |||
# For shared-library builds with ReLAPACK enabled, pass the linker an option
# that tolerates multiply-defined symbols: "-allow-multiple-definition" for
# non-MSVC toolchains, "/FORCE:MULTIPLE" for MSVC.
# NOTE(review): presumably needed because ReLAPACK and LAPACK define the same
# routine names - confirm against relapack/src.
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
if (NOT MSVC) | |||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
else() | |||
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") | |||
endif() | |||
endif() | |||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | |||
if (NOT DEFINED ARCH) | |||
set(ARCH_IN "x86_64") | |||
@@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
if(NOT NOFORTRAN) | |||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) | |||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) | |||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") | |||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") | |||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") | |||
@@ -327,10 +342,11 @@ endif() | |||
if(NOT NO_CBLAS) | |||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") | |||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) | |||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
endif() | |||
if(NOT NO_LAPACKE) | |||
@@ -1,4 +1,82 @@ | |||
OpenBLAS ChangeLog | |||
==================================================================== | |||
Version 0.3.6 | |||
29-Apr-2019 | |||
common: | |||
* the build tools now check that a given cpu TARGET is actually valid | |||
* the build-time check of system features (c_check) has been made | |||
less dependent on particular perl features (this should mainly | |||
benefit building on Windows) | |||
* several problems with the ReLAPACK integration were fixed, | |||
including INTERFACE64 support and building a shared library | |||
* building with CMAKE on BSD systems was improved | |||
* a non-absolute SUM function was added based on the | |||
existing optimized code for ASUM | |||
* CBLAS interfaces to the IxMIN and IxMAX functions were added | |||
* a name clash between LAPACKE and BOOST headers was resolved | |||
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel | |||
kernels | |||
* a crash on thread (key) deletion with the USE_TLS=1 memory management | |||
option was fixed | |||
* restored several earlier fixes, in particular for OpenMP performance, | |||
building on BSD, and calling fork on CYGWIN, which had inadvertently | |||
been dropped in the 0.3.3 rewrite of the memory management code. | |||
x86_64: | |||
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems | |||
* building with old versions of MSVC was fixed | |||
* it is now possible to build a static library on Windows with CMAKE | |||
* accessing environment variables on CYGWIN at run time was fixed | |||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected | |||
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported | |||
with CMAKE as well | |||
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported | |||
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed | |||
* assembly bugs involving undeclared modification of input operands were fixed | |||
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, | |||
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause | |||
test failures or segfaults when compiled with recent versions of gcc from 8 onward. | |||
* a similar bug was fixed in the blas_quickdivide code used to split workloads | |||
in most functions | |||
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX | |||
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating | |||
environment does not support AVX512 | |||
* improved GEMM performance on ZEN targets | |||
x86: | |||
* build failures caused by the recently added checks for AVX512 were fixed | |||
* an inline assembly bug involving undeclared modification of an input argument was | |||
fixed in the blas_quickdivide code used to split workloads in most functions | |||
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX | |||
MIPS32: | |||
* a bug in the IMIN implementation made it return the result of IMAX | |||
POWER: | |||
* single precision BLAS1/2 functions have received optimized POWER8 kernels | |||
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel | |||
* building on PPC970 systems under OSX Leopard or Tiger is now supported | |||
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed | |||
* building a shared library on AIX is now supported for POWER6 | |||
* DYNAMIC_ARCH support has been added for POWER6 and newer | |||
ARMv7: | |||
* corrected xDOT behaviour with zero INC_X or INC_Y | |||
* a bug in the IMIN implementation made it return the result of IMAX | |||
ARMv8: | |||
* added support for HiSilicon TSV110 cpus | |||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
* cross-compilation with CMAKE now works again | |||
* a bug in the IMIN implementation made it return the result of IMAX | |||
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 | |||
IBM Z: | |||
* optimized microkernels for single precision BLAS1/2 functions have been added | |||
for both Z13 and Z14 | |||
==================================================================== | |||
Version 0.3.5 | |||
31-Dec-2018 | |||
@@ -96,7 +96,7 @@ endif | |||
@echo | |||
shared : | |||
ifndef NO_SHARED | |||
ifneq ($(NO_SHARED), 1) | |||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
@$(MAKE) -C exports so | |||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
@@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) | |||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
endif | |||
ifeq ($(CORE), TSV110) | |||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
endif |
@@ -58,14 +58,14 @@ ifndef NO_LAPACKE | |||
endif | |||
#for install static library | |||
ifndef NO_STATIC | |||
ifneq ($(NO_STATIC),1) | |||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
endif | |||
#for install shared library | |||
ifndef NO_SHARED | |||
ifneq ($(NO_SHARED),1) | |||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
@@ -106,14 +106,14 @@ ifndef NO_LAPACKE | |||
endif | |||
#for install static library | |||
ifndef NO_STATIC | |||
ifneq ($(NO_STATIC),1) | |||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
endif | |||
#for install shared library | |||
ifndef NO_SHARED | |||
ifneq ($(NO_SHARED),1) | |||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
@@ -138,7 +138,7 @@ endif | |||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
ifndef NO_SHARED | |||
ifneq ($(NO_SHARED),1) | |||
#ifeq logical or | |||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@@ -9,7 +9,15 @@ else | |||
USE_OPENMP = 1 | |||
endif | |||
# Flags for CORE=POWER9. Both branches append the same power9 tuning options
# to COMMON_OPT and FCOMMON_OPT; the USE_OPENMP=1 branch additionally appends
# -DUSE_OPENMP and -fopenmp to each.
ifeq ($(CORE), POWER9) | |||
ifeq ($(USE_OPENMP), 1) | |||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
else | |||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||
endif | |||
endif | |||
ifeq ($(CORE), POWER8) | |||
ifeq ($(USE_OPENMP), 1) | |||
@@ -3,7 +3,7 @@ | |||
# | |||
# This library's version | |||
VERSION = 0.3.5 | |||
VERSION = 0.3.6 | |||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
@@ -48,6 +48,8 @@ VERSION = 0.3.5 | |||
# HOSTCC = gcc | |||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | |||
# Please note that AVX is not available on 32-bit. | |||
# Setting BINARY=32 disables AVX/AVX2/AVX-512. | |||
# BINARY=64 | |||
# About threaded BLAS. It will be automatically detected if you don't | |||
@@ -57,7 +59,7 @@ VERSION = 0.3.5 | |||
# USE_THREAD = 0 | |||
# If you're going to use this library with OpenMP, please comment it in. | |||
# This flag is always set for POWER8. Don't modify the flag | |||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | |||
# USE_OPENMP = 1 | |||
# The OpenMP scheduler to use - by default this is "static" and you | |||
@@ -68,36 +70,45 @@ VERSION = 0.3.5 | |||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
# CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
# You can define maximum number of threads. Basically it should be | |||
# less than actual number of cores. If you don't specify one, it's | |||
# automatically detected by the the script. | |||
# You can define the maximum number of threads. Basically it should be less | |||
# than or equal to the number of CPU threads. If you don't specify one, it's | |||
# automatically detected by the build system. | |||
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to | |||
# restrict NUM_THREADS to the number of physical cores. By default, the automatic | |||
# detection includes logical CPUs, thus allowing the use of SMT. | |||
# Users may opt at runtime to use less than NUM_THREADS threads. | |||
# | |||
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS | |||
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way | |||
# some internal structures are allocated, using a large NUM_THREADS value has a RAM | |||
# footprint penalty, even if users reduce the actual number of threads at runtime. | |||
# NUM_THREADS = 24 | |||
# If you have enabled USE_OPENMP and your application would call | |||
# OpenBLAS's calculation API from multi threads, please comment it in. | |||
# This flag defines how many instances of OpenBLAS's calculation API can | |||
# actually run in parallel. If more threads call OpenBLAS's calculation API, | |||
# OpenBLAS's calculation API from multiple threads, please comment this in. | |||
# This flag defines how many instances of OpenBLAS's calculation API can actually | |||
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, | |||
# they need to wait for the preceding API calls to finish or risk data corruption. | |||
# NUM_PARALLEL = 2 | |||
# if you don't need to install the static library, please comment it in. | |||
# If you don't need to install the static library, please comment this in. | |||
# NO_STATIC = 1 | |||
# if you don't need generate the shared library, please comment it in. | |||
# If you don't need to generate the shared library, please comment this in. | |||
# NO_SHARED = 1 | |||
# If you don't need CBLAS interface, please comment it in. | |||
# If you don't need the CBLAS interface, please comment this in. | |||
# NO_CBLAS = 1 | |||
# If you only want CBLAS interface without installing Fortran compiler, | |||
# please comment it in. | |||
# If you only want the CBLAS interface without installing a Fortran compiler, | |||
# please comment this in. | |||
# ONLY_CBLAS = 1 | |||
# If you don't need LAPACK, please comment it in. | |||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||
# If you don't need LAPACK, please comment this in. | |||
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. | |||
# NO_LAPACK = 1 | |||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. | |||
# NO_LAPACKE = 1 | |||
# Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
@@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
# Build RecursiveLAPACK on top of LAPACK | |||
# BUILD_RELAPACK = 1 | |||
# If you want to use legacy threaded Level 3 implementation. | |||
# If you want to use the legacy threaded Level 3 implementation. | |||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
# If you want to use the new, still somewhat experimental code that uses | |||
@@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
# USE_TLS = 1 | |||
# If you want to drive whole 64bit region by BLAS. Not all Fortran | |||
# compiler supports this. It's safe to keep comment it out if you | |||
# are not sure(equivalent to "-i8" option). | |||
# compilers support this. It's safe to keep this commented out if you | |||
# are not sure. (This is equivalent to the "-i8" ifort option). | |||
# INTERFACE64 = 1 | |||
# Unfortunately most of kernel won't give us high quality buffer. | |||
@@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
# but it will consume time. If you don't like it, you can disable one. | |||
NO_WARMUP = 1 | |||
# If you want to disable CPU/Memory affinity on Linux. | |||
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. | |||
# This feature is only implemented on Linux, and is always disabled on other platforms. | |||
# Enabling affinity handling may improve performance, especially on NUMA systems, but | |||
# it may conflict with certain applications that also try to manage affinity. | |||
# This conflict can result in threads of the application calling OpenBLAS ending up locked | |||
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. | |||
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing | |||
# else modifies affinity settings. | |||
# Note: enabling affinity has been known to cause problems with NumPy and R | |||
NO_AFFINITY = 1 | |||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
# BIGNUMA = 1 | |||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | |||
@@ -180,7 +199,7 @@ NO_AFFINITY = 1 | |||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
# GEMM_MULTITHREAD_THRESHOLD = 4 | |||
# If you need santy check by comparing reference BLAS. It'll be very | |||
# If you need sanity check by comparing results to reference BLAS. It'll be very | |||
# slow (Not implemented yet). | |||
# SANITY_CHECK = 1 | |||
@@ -65,6 +65,7 @@ endif | |||
ifdef TARGET | |||
GETARCH_FLAGS := -DFORCE_$(TARGET) | |||
GETARCH_FLAGS += -DUSER_TARGET | |||
endif | |||
# Force fallbacks for 32bit | |||
@@ -94,6 +95,9 @@ endif | |||
ifeq ($(TARGET), ZEN) | |||
GETARCH_FLAGS := -DFORCE_BARCELONA | |||
endif | |||
ifeq ($(TARGET), ARMV8) | |||
GETARCH_FLAGS := -DFORCE_ARMV7 | |||
endif | |||
endif | |||
@@ -151,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX | |||
endif | |||
ifeq ($(BINARY), 32) | |||
GETARCH_FLAGS += -DNO_AVX | |||
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 | |||
NO_AVX512 = 1 | |||
endif | |||
ifeq ($(NO_AVX2), 1) | |||
@@ -523,6 +528,12 @@ DYNAMIC_CORE += THUNDERX | |||
DYNAMIC_CORE += THUNDERX2T99 | |||
endif | |||
ifeq ($(ARCH), power) | |||
DYNAMIC_CORE = POWER6 | |||
DYNAMIC_CORE += POWER8 | |||
DYNAMIC_CORE += POWER9 | |||
endif | |||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
ifndef DYNAMIC_CORE | |||
override DYNAMIC_ARCH= | |||
@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector | |||
FCOMMON_OPT += -march=z13 -mzvector | |||
endif | |||
ifeq ($(CORE), Z14) | |||
CCOMMON_OPT += -march=z14 -mzvector | |||
FCOMMON_OPT += -march=z14 -mzvector | |||
endif |
@@ -48,6 +48,7 @@ POWER5 | |||
POWER6 | |||
POWER7 | |||
POWER8 | |||
POWER9 | |||
PPCG4 | |||
PPC970 | |||
PPC970MP | |||
@@ -90,7 +91,9 @@ CORTEXA73 | |||
FALKOR | |||
THUNDERX | |||
THUNDERX2T99 | |||
TSV110 | |||
9.System Z: | |||
ZARCH_GENERIC | |||
Z13 | |||
Z14 |
@@ -53,9 +53,9 @@ before_build: | |||
- ps: if (-Not (Test-Path .\build)) { mkdir build } | |||
- cd build | |||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | |||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. | |||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. | |||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
build_script: | |||
- cmake --build . | |||
@@ -2,6 +2,8 @@ | |||
argv <- commandArgs(trailingOnly = TRUE) | |||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
nfrom <- 128 | |||
nto <- 2048 | |||
nstep <- 128 | |||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
loops <- as.numeric(argv[z]) | |||
} | |||
} | |||
} | |||
p <- Sys.getenv("OPENBLAS_LOOPS") | |||
@@ -27,29 +28,21 @@ if (p != "") { | |||
loops <- as.numeric(p) | |||
} | |||
cat(sprintf( | |||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
nfrom, | |||
nto, | |||
nstep, | |||
loops | |||
)) | |||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
cat(sprintf(" SIZE Flops Time\n")) | |||
n <- nfrom | |||
while (n <= nto) { | |||
A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
A <- matrix(rnorm(n * n), nrow = n) | |||
ev <- 0 | |||
z <- system.time(for (l in 1:loops) { | |||
ev <- eigen(A) | |||
}) | |||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) | |||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) | |||
st <- sprintf("%.0fx%.0f :", n, n) | |||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
n <- n + nstep | |||
} |
@@ -2,6 +2,8 @@ | |||
argv <- commandArgs(trailingOnly = TRUE) | |||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
nfrom <- 128 | |||
nto <- 2048 | |||
nstep <- 128 | |||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
loops <- as.numeric(argv[z]) | |||
} | |||
} | |||
} | |||
p <- Sys.getenv("OPENBLAS_LOOPS") | |||
@@ -27,26 +28,13 @@ if (p != "") { | |||
loops <- as.numeric(p) | |||
} | |||
cat(sprintf( | |||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
nfrom, | |||
nto, | |||
nstep, | |||
loops | |||
)) | |||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
cat(sprintf(" SIZE Flops Time\n")) | |||
n <- nfrom | |||
while (n <= nto) { | |||
A <- matrix(runif(n * n), | |||
ncol = n, | |||
nrow = n, | |||
byrow = TRUE) | |||
B <- matrix(runif(n * n), | |||
ncol = n, | |||
nrow = n, | |||
byrow = TRUE) | |||
A <- matrix(runif(n * n), nrow = n) | |||
B <- matrix(runif(n * n), nrow = n) | |||
C <- 1 | |||
z <- system.time(for (l in 1:loops) { | |||
@@ -54,11 +42,10 @@ while (n <= nto) { | |||
l <- l + 1 | |||
}) | |||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) | |||
st <- sprintf("%.0fx%.0f :", n, n) | |||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
n <- n + nstep | |||
} |
@@ -2,6 +2,8 @@ | |||
argv <- commandArgs(trailingOnly = TRUE) | |||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
nfrom <- 128 | |||
nto <- 2048 | |||
nstep <- 128 | |||
@@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
loops <- as.numeric(argv[z]) | |||
} | |||
} | |||
} | |||
p <- Sys.getenv("OPENBLAS_LOOPS") | |||
@@ -27,31 +28,22 @@ if (p != "") { | |||
loops <- as.numeric(p) | |||
} | |||
cat(sprintf( | |||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
nfrom, | |||
nto, | |||
nstep, | |||
loops | |||
)) | |||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
cat(sprintf(" SIZE Flops Time\n")) | |||
n <- nfrom | |||
while (n <= nto) { | |||
A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
B <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
A <- matrix(rnorm(n * n), nrow = n) | |||
B <- matrix(rnorm(n * n), nrow = n) | |||
z <- system.time(for (l in 1:loops) { | |||
solve(A, B) | |||
}) | |||
mflops <- | |||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) | |||
st <- sprintf("%.0fx%.0f :", n, n) | |||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
n <- n + nstep | |||
} |
@@ -1,7 +1,7 @@ | |||
#!/usr/bin/perl | |||
use File::Basename; | |||
use File::Temp qw(tempfile); | |||
#use File::Basename; | |||
# use File::Temp qw(tempfile); | |||
# Checking cross compile | |||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
$hostarch = "zarch" if ($hostarch eq "s390x"); | |||
$tmpf = new File::Temp( UNLINK => 1 ); | |||
#$tmpf = new File::Temp( UNLINK => 1 ); | |||
$binary = $ENV{"BINARY"}; | |||
$makefile = shift(@ARGV); | |||
@@ -31,12 +31,25 @@ if ($?) { | |||
$cross_suffix = ""; | |||
if (dirname($compiler_name) ne ".") { | |||
$cross_suffix .= dirname($compiler_name) . "/"; | |||
} | |||
eval "use File::Basename"; | |||
if ($@){ | |||
warn "could not load PERL module File::Basename, emulating its functionality"; | |||
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); | |||
if ($dirnam ne ".") { | |||
$cross_suffix .= $dirnam . "/"; | |||
} | |||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); | |||
if ($basnam =~ /([^\s]*-)(.*)/) { | |||
$cross_suffix .= $1; | |||
} | |||
} else { | |||
if (dirname($compiler_name) ne ".") { | |||
$cross_suffix .= dirname($compiler_name) . "/"; | |||
} | |||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
$cross_suffix .= $1; | |||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
$cross_suffix .= $1; | |||
} | |||
} | |||
$compiler = ""; | |||
@@ -171,20 +184,26 @@ if ($?) { | |||
$have_msa = 0; | |||
if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
$code = '"addvi.b $w0, $w1, 1"'; | |||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
print $tmpf "#include <msa.h>\n\n"; | |||
print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
$args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
my @cmd = ("$compiler_name $args"); | |||
system(@cmd) == 0; | |||
if ($? != 0) { | |||
$have_msa = 0; | |||
eval "use File::Temp qw(tempfile)"; | |||
if ($@){ | |||
warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||
} else { | |||
$have_msa = 1; | |||
$tmpf = new File::Temp( UNLINK => 1 ); | |||
$code = '"addvi.b $w0, $w1, 1"'; | |||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
print $tmpf "#include <msa.h>\n\n"; | |||
print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
$args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
my @cmd = ("$compiler_name $args"); | |||
system(@cmd) == 0; | |||
if ($? != 0) { | |||
$have_msa = 0; | |||
} else { | |||
$have_msa = 1; | |||
} | |||
unlink("$tmpf.o"); | |||
} | |||
unlink("$tmpf.o"); | |||
} | |||
$architecture = x86 if ($data =~ /ARCH_X86/); | |||
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); | |||
$no_avx512= 0; | |||
if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
system(@cmd) == 0; | |||
if ($? != 0) { | |||
$no_avx512 = 1; | |||
} else { | |||
eval "use File::Temp qw(tempfile)"; | |||
if ($@){ | |||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||
$no_avx512 = 0; | |||
} else { | |||
# $tmpf = new File::Temp( UNLINK => 1 ); | |||
($fh,$tmpf) = tempfile( UNLINK => 1 ); | |||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; | |||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
system(@cmd) == 0; | |||
if ($? != 0) { | |||
$no_avx512 = 1; | |||
} else { | |||
$no_avx512 = 0; | |||
} | |||
unlink("tmpf.o"); | |||
} | |||
unlink("tmpf.o"); | |||
} | |||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS | |||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | |||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | |||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | |||
@@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
@@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) | |||
if (NOT NO_AVX512) | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
endif () | |||
if (DYNAMIC_LIST) | |||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | |||
endif () | |||
endif () | |||
if (NOT DYNAMIC_CORE) | |||
@@ -107,6 +107,12 @@ macro(SetDefaultL1) | |||
set(DAXPBYKERNEL ../arm/axpby.c) | |||
set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
set(SSUMKERNEL sum.S) | |||
set(DSUMKERNEL sum.S) | |||
set(CSUMKERNEL zsum.S) | |||
set(ZSUMKERNEL zsum.S) | |||
set(QSUMKERNEL sum.S) | |||
set(XSUMKERNEL zsum.S) | |||
endmacro () | |||
macro(SetDefaultL2) | |||
@@ -162,4 +168,4 @@ macro(SetDefaultL3) | |||
set(DGEADD_KERNEL ../generic/geadd.c) | |||
set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
endmacro () | |||
endmacro () |
@@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") | |||
set(NO_EXPRECISION 1) | |||
endif () | |||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") | |||
set(EXTRALIB "${EXTRALIB} -lm") | |||
set(NO_EXPRECISION 1) | |||
endif () | |||
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") | |||
set(EXTRALIB "${EXTRALIB} -lm") | |||
endif () | |||
@@ -87,13 +87,18 @@ endif () | |||
# Cannot run getarch on target if we are cross-compiling | |||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | |||
# Write to config as getarch would | |||
if (DEFINED TARGET_CORE) | |||
set(TCORE ${TARGET_CORE}) | |||
else() | |||
set(TCORE ${CORE}) | |||
endif() | |||
# TODO: Set up defines that getarch sets up based on every other target | |||
# Perhaps this should be inside a different file as it grows larger | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define ${CORE}\n" | |||
"#define CHAR_CORENAME \"${CORE}\"\n") | |||
if ("${CORE}" STREQUAL "ARMV7") | |||
"#define ${TCORE}\n" | |||
"#define CHAR_CORENAME \"${TCORE}\"\n") | |||
if ("${TCORE}" STREQUAL "ARMV7") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_DATA_SIZE\t65536\n" | |||
"#define L1_DATA_LINESIZE\t32\n" | |||
@@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(SGEMM_UNROLL_N 4) | |||
set(DGEMM_UNROLL_M 4) | |||
set(DGEMM_UNROLL_N 4) | |||
elseif ("${CORE}" STREQUAL "ARMV8") | |||
elseif ("${TCORE}" STREQUAL "ARMV8") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_DATA_SIZE\t32768\n" | |||
"#define L1_DATA_LINESIZE\t64\n" | |||
@@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
"#define DTB_SIZE\t4096\n" | |||
"#define L2_ASSOCIATIVE\t32\n" | |||
"#define ARMV8\n") | |||
set(SGEMM_UNROLL_M 4) | |||
set(SGEMM_UNROLL_M 16) | |||
set(SGEMM_UNROLL_N 4) | |||
elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") | |||
set(DGEMM_UNROLL_M 8) | |||
set(DGEMM_UNROLL_N 4) | |||
set(CGEMM_UNROLL_M 8) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t32768\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(DGEMM_UNROLL_N 4) | |||
set(CGEMM_UNROLL_M 8) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 8) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t49152\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(DGEMM_UNROLL_N 4) | |||
set(CGEMM_UNROLL_M 8) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 8) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
elseif ("${CORE}" STREQUAL "FALKOR") | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "FALKOR") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t65536\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(DGEMM_UNROLL_N 4) | |||
set(CGEMM_UNROLL_M 8) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 8) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
elseif ("${CORE}" STREQUAL "THUNDERX) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "THUNDERX") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t32768\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(CGEMM_UNROLL_N 2) | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 2) | |||
elseif ("${CORE}" STREQUAL "THUNDERX2T99) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "THUNDERX2T99") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t32768\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
"#define L3_ASSOCIATIVE\t32\n" | |||
"#define DTB_DEFAULT_ENTRIES\t64\n" | |||
"#define DTB_SIZE\t4096\n" | |||
"#define VULCAN\n") | |||
"#define ARMV8\n") | |||
set(SGEMM_UNROLL_M 16) | |||
set(SGEMM_UNROLL_N 4) | |||
set(DGEMM_UNROLL_M 8) | |||
@@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
endif() | |||
# Or should this actually be NUM_CORES? | |||
@@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
set(TARGET "BARCELONA") | |||
endif () | |||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||
set(TARGET "ARMV7") | |||
endif () | |||
endif () | |||
if (DEFINED TARGET) | |||
@@ -184,6 +187,13 @@ if (DYNAMIC_ARCH) | |||
endif () | |||
endif () | |||
# When DYNAMIC_LIST is set, append -DDYNAMIC_LIST followed by one
# -DDYN_<core> define per list entry (in list order) to CCOMMON_OPT.
if (DYNAMIC_LIST)
  set(openblas_dyn_list_flags "-DDYNAMIC_LIST")
  foreach (listed_core ${DYNAMIC_LIST})
    set(openblas_dyn_list_flags "${openblas_dyn_list_flags} -DDYN_${listed_core}")
  endforeach ()
  set(CCOMMON_OPT "${CCOMMON_OPT} ${openblas_dyn_list_flags}")
  unset(openblas_dyn_list_flags)
endif ()
if (NO_LAPACK) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") | |||
#Disable LAPACK C interface | |||
@@ -39,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
set(MIPS64 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
set(X86_64 1) | |||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
set(X86_64 1) | |||
else() | |||
set(X86 1) | |||
endif() | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
set(X86 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") | |||
set(ARM 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") | |||
set(ARM64 1) | |||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
set(ARM64 1) | |||
else() | |||
set(ARM 1) | |||
endif() | |||
endif() | |||
if (X86_64) | |||
@@ -78,7 +86,7 @@ endif() | |||
if (X86_64 OR X86) | |||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
if (NO_AVX512 EQUAL 1) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
endif() | |||
@@ -85,6 +85,8 @@ extern "C" { | |||
#if !defined(_MSC_VER) | |||
#include <unistd.h> | |||
#elif _MSC_VER < 1900 | |||
#define snprintf _snprintf | |||
#endif | |||
#include <time.h> | |||
@@ -348,6 +350,11 @@ typedef int blasint; | |||
#endif | |||
#endif | |||
#ifdef POWER9 | |||
#ifndef YIELDING | |||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
#endif | |||
#endif | |||
/* | |||
#ifdef PILEDRIVER | |||
@@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
typedef char env_var_t[MAX_PATH]; | |||
#define readenv(p, n) 0 | |||
#else | |||
#ifdef OS_WINDOWS | |||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
typedef char env_var_t[MAX_PATH]; | |||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
#else | |||
@@ -19,6 +19,7 @@ | |||
#define CDOTC_K cdotc_k | |||
#define CNRM2_K cnrm2_k | |||
#define CSCAL_K cscal_k | |||
#define CSUM_K csum_k | |||
#define CSWAP_K cswap_k | |||
#define CROT_K csrot_k | |||
@@ -249,6 +250,7 @@ | |||
#define CDOTC_K gotoblas -> cdotc_k | |||
#define CNRM2_K gotoblas -> cnrm2_k | |||
#define CSCAL_K gotoblas -> cscal_k | |||
#define CSUM_K gotoblas -> csum_k | |||
#define CSWAP_K gotoblas -> cswap_k | |||
#define CROT_K gotoblas -> csrot_k | |||
@@ -19,6 +19,7 @@ | |||
#define DDOTC_K ddot_k | |||
#define DNRM2_K dnrm2_k | |||
#define DSCAL_K dscal_k | |||
#define DSUM_K dsum_k | |||
#define DSWAP_K dswap_k | |||
#define DROT_K drot_k | |||
@@ -174,6 +175,7 @@ | |||
#define DDOTC_K gotoblas -> ddot_k | |||
#define DNRM2_K gotoblas -> dnrm2_k | |||
#define DSCAL_K gotoblas -> dscal_k | |||
#define DSUM_K gotoblas -> dsum_k | |||
#define DSWAP_K gotoblas -> dswap_k | |||
#define DROT_K gotoblas -> drot_k | |||
@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); | |||
double BLASFUNC(dzasum)(blasint *, double *, blasint *); | |||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | |||
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); | |||
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); | |||
double BLASFUNC(dsum) (blasint *, double *, blasint *); | |||
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); | |||
double BLASFUNC(dzsum)(blasint *, double *, blasint *); | |||
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); | |||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | |||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | |||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | |||
@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); | |||
double zasum_k (BLASLONG, double *, BLASLONG); | |||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | |||
float ssum_k (BLASLONG, float *, BLASLONG); | |||
double dsum_k (BLASLONG, double *, BLASLONG); | |||
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); | |||
float csum_k (BLASLONG, float *, BLASLONG); | |||
double zsum_k (BLASLONG, double *, BLASLONG); | |||
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); | |||
float samax_k (BLASLONG, float *, BLASLONG); | |||
double damax_k (BLASLONG, double *, BLASLONG); | |||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | |||
@@ -66,6 +66,7 @@ | |||
#define DOTC_K QDOTC_K | |||
#define NRM2_K QNRM2_K | |||
#define SCAL_K QSCAL_K | |||
#define SUM_K QSUM_K | |||
#define SWAP_K QSWAP_K | |||
#define ROT_K QROT_K | |||
@@ -356,6 +357,7 @@ | |||
#define DOTC_K DDOTC_K | |||
#define NRM2_K DNRM2_K | |||
#define SCAL_K DSCAL_K | |||
#define SUM_K DSUM_K | |||
#define SWAP_K DSWAP_K | |||
#define ROT_K DROT_K | |||
@@ -658,6 +660,7 @@ | |||
#define DOTC_K SDOTC_K | |||
#define NRM2_K SNRM2_K | |||
#define SCAL_K SSCAL_K | |||
#define SUM_K SSUM_K | |||
#define SWAP_K SSWAP_K | |||
#define ROT_K SROT_K | |||
@@ -962,6 +965,7 @@ | |||
#define DOTC_K XDOTC_K | |||
#define NRM2_K XNRM2_K | |||
#define SCAL_K XSCAL_K | |||
#define SUM_K XSUM_K | |||
#define SWAP_K XSWAP_K | |||
#define ROT_K XROT_K | |||
@@ -1363,6 +1367,7 @@ | |||
#define DOTC_K ZDOTC_K | |||
#define NRM2_K ZNRM2_K | |||
#define SCAL_K ZSCAL_K | |||
#define SUM_K ZSUM_K | |||
#define SWAP_K ZSWAP_K | |||
#define ROT_K ZROT_K | |||
@@ -1785,6 +1790,7 @@ | |||
#define DOTC_K CDOTC_K | |||
#define NRM2_K CNRM2_K | |||
#define SCAL_K CSCAL_K | |||
#define SUM_K CSUM_K | |||
#define SWAP_K CSWAP_K | |||
#define ROT_K CROT_K | |||
@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
float (*snrm2_k) (BLASLONG, float *, BLASLONG); | |||
float (*sasum_k) (BLASLONG, float *, BLASLONG); | |||
float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | |||
double (*dasum_k) (BLASLONG, double *, BLASLONG); | |||
double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
float (*csum_k) (BLASLONG, float *, BLASLONG); | |||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
double (*znrm2_k) (BLASLONG, double *, BLASLONG); | |||
double (*zasum_k) (BLASLONG, double *, BLASLONG); | |||
double (*zsum_k) (BLASLONG, double *, BLASLONG); | |||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
@@ -39,7 +39,7 @@ | |||
#ifndef COMMON_POWER | |||
#define COMMON_POWER | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#define MB __asm__ __volatile__ ("eieio":::"memory") | |||
#define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
#else | |||
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
#define HAVE_PREFETCH | |||
#endif | |||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||
#define DCBT_ARG 0 | |||
#else | |||
#define DCBT_ARG 8 | |||
@@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
#define L1_PREFETCH dcbtst | |||
#endif | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#define L1_DUALFETCH | |||
#define L1_PREFETCHSIZE (16 + 128 * 100) | |||
#define L1_PREFETCH dcbtst | |||
@@ -598,9 +598,14 @@ REALNAME:;\ | |||
#ifndef __64BIT__ | |||
#define PROLOGUE \ | |||
.machine "any";\ | |||
.toc;\ | |||
.globl .REALNAME;\ | |||
.globl REALNAME;\ | |||
.csect REALNAME[DS],3;\ | |||
REALNAME:;\ | |||
.long .REALNAME, TOC[tc0], 0;\ | |||
.csect .text[PR],5;\ | |||
.REALNAME:; | |||
.REALNAME: | |||
#define EPILOGUE \ | |||
_section_.text:;\ | |||
@@ -611,9 +616,14 @@ _section_.text:;\ | |||
#define PROLOGUE \ | |||
.machine "any";\ | |||
.toc;\ | |||
.globl .REALNAME;\ | |||
.globl REALNAME;\ | |||
.csect REALNAME[DS],3;\ | |||
REALNAME:;\ | |||
.llong .REALNAME, TOC[tc0], 0;\ | |||
.csect .text[PR], 5;\ | |||
.REALNAME:; | |||
.REALNAME: | |||
#define EPILOGUE \ | |||
_section_.text:;\ | |||
@@ -802,7 +812,7 @@ Lmcount$lazy_ptr: | |||
#define BUFFER_SIZE ( 2 << 20) | |||
#elif defined(PPC440FP2) | |||
#define BUFFER_SIZE ( 16 << 20) | |||
#elif defined(POWER8) | |||
#elif defined(POWER8) || defined(POWER9) | |||
#define BUFFER_SIZE ( 64 << 20) | |||
#else | |||
#define BUFFER_SIZE ( 16 << 20) | |||
@@ -19,6 +19,7 @@ | |||
#define QDOTC_K qdot_k | |||
#define QNRM2_K qnrm2_k | |||
#define QSCAL_K qscal_k | |||
#define QSUM_K qsum_k | |||
#define QSWAP_K qswap_k | |||
#define QROT_K qrot_k | |||
@@ -161,6 +162,7 @@ | |||
#define QDOTC_K gotoblas -> qdot_k | |||
#define QNRM2_K gotoblas -> qnrm2_k | |||
#define QSCAL_K gotoblas -> qscal_k | |||
#define QSUM_K gotoblas -> qsum_k | |||
#define QSWAP_K gotoblas -> qswap_k | |||
#define QROT_K gotoblas -> qrot_k | |||
@@ -12,6 +12,7 @@ | |||
#define ISMAX_K ismax_k | |||
#define ISMIN_K ismin_k | |||
#define SASUM_K sasum_k | |||
#define SSUM_K ssum_k | |||
#define SAXPYU_K saxpy_k | |||
#define SAXPYC_K saxpy_k | |||
#define SCOPY_K scopy_k | |||
@@ -170,6 +171,7 @@ | |||
#define ISMAX_K gotoblas -> ismax_k | |||
#define ISMIN_K gotoblas -> ismin_k | |||
#define SASUM_K gotoblas -> sasum_k | |||
#define SSUM_K gotoblas -> ssum_k | |||
#define SAXPYU_K gotoblas -> saxpy_k | |||
#define SAXPYC_K gotoblas -> saxpy_k | |||
#define SCOPY_K gotoblas -> scopy_k | |||
@@ -19,6 +19,7 @@ | |||
#define XDOTC_K xdotc_k | |||
#define XNRM2_K xnrm2_k | |||
#define XSCAL_K xscal_k | |||
#define XSUM_K xsum_k | |||
#define XSWAP_K xswap_k | |||
#define XROT_K xqrot_k | |||
@@ -227,6 +228,7 @@ | |||
#define XDOTC_K gotoblas -> xdotc_k | |||
#define XNRM2_K gotoblas -> xnrm2_k | |||
#define XSCAL_K gotoblas -> xscal_k | |||
#define XSUM_K gotoblas -> xsum_k | |||
#define XSWAP_K gotoblas -> xswap_k | |||
#define XROT_K gotoblas -> xqrot_k | |||
@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
y = blas_quick_divide_table[y]; | |||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); | |||
return result; | |||
#endif | |||
@@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
"=b" (*ebx), | |||
"=c" (*ecx), | |||
"=d" (*edx) | |||
: "0" (op)); | |||
: "0" (op), "c"(0)); | |||
#endif | |||
} | |||
@@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
y = blas_quick_divide_table[y]; | |||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||
return result; | |||
} | |||
@@ -19,6 +19,7 @@ | |||
#define ZDOTC_K zdotc_k | |||
#define ZNRM2_K znrm2_k | |||
#define ZSCAL_K zscal_k | |||
#define ZSUM_K zsum_k | |||
#define ZSWAP_K zswap_k | |||
#define ZROT_K zdrot_k | |||
@@ -249,6 +250,7 @@ | |||
#define ZDOTC_K gotoblas -> zdotc_k | |||
#define ZNRM2_K gotoblas -> znrm2_k | |||
#define ZSCAL_K gotoblas -> zscal_k | |||
#define ZSUM_K gotoblas -> zsum_k | |||
#define ZSWAP_K gotoblas -> zswap_k | |||
#define ZROT_K gotoblas -> zdrot_k | |||
@@ -53,6 +53,7 @@ | |||
#define VENDOR_SIS 8 | |||
#define VENDOR_TRANSMETA 9 | |||
#define VENDOR_NSC 10 | |||
#define VENDOR_HYGON 11 | |||
#define VENDOR_UNKNOWN 99 | |||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
@@ -116,6 +117,7 @@ | |||
#define CORE_EXCAVATOR 26 | |||
#define CORE_ZEN 27 | |||
#define CORE_SKYLAKEX 28 | |||
#define CORE_DHYANA 29 | |||
#define HAVE_SSE (1 << 0) | |||
#define HAVE_SSE2 (1 << 1) | |||
@@ -139,6 +141,7 @@ | |||
#define HAVE_FMA4 (1 << 19) | |||
#define HAVE_FMA3 (1 << 20) | |||
#define HAVE_AVX512VL (1 << 21) | |||
#define HAVE_AVX2 (1 << 22) | |||
#define CACHE_INFO_L1_I 1 | |||
#define CACHE_INFO_L1_D 2 | |||
@@ -214,5 +217,8 @@ typedef struct { | |||
#define CPUTYPE_EXCAVATOR 50 | |||
#define CPUTYPE_ZEN 51 | |||
#define CPUTYPE_SKYLAKEX 52 | |||
#define CPUTYPE_DHYANA 53 | |||
#define CPUTYPE_HYGON_UNKNOWN 54 | |||
#endif |
@@ -39,6 +39,8 @@ | |||
// Cavium | |||
#define CPU_THUNDERX 7 | |||
#define CPU_THUNDERX2T99 8 | |||
//Hisilicon | |||
#define CPU_TSV110 9 | |||
static char *cpuname[] = { | |||
"UNKNOWN", | |||
@@ -49,7 +51,8 @@ static char *cpuname[] = { | |||
"CORTEXA73", | |||
"FALKOR", | |||
"THUNDERX", | |||
"THUNDERX2T99" | |||
"THUNDERX2T99", | |||
"TSV110" | |||
}; | |||
static char *cpuname_lower[] = { | |||
@@ -61,7 +64,8 @@ static char *cpuname_lower[] = { | |||
"cortexa73", | |||
"falkor", | |||
"thunderx", | |||
"thunderx2t99" | |||
"thunderx2t99", | |||
"tsv110" | |||
}; | |||
int get_feature(char *search) | |||
@@ -145,6 +149,9 @@ int detect(void) | |||
return CPU_THUNDERX; | |||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
return CPU_THUNDERX2T99; | |||
// HiSilicon | |||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||
return CPU_TSV110; | |||
} | |||
p = (char *) NULL ; | |||
@@ -286,6 +293,21 @@ void get_cpuconfig(void) | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
case CPU_TSV110: | |||
printf("#define TSV110 \n"); | |||
printf("#define L1_CODE_SIZE 65536 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
printf("#define L1_DATA_SIZE 65536 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
printf("#define L2_SIZE 524228 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
} | |||
} | |||
@@ -94,7 +94,7 @@ char *corename[] = { | |||
"CELL", | |||
"PPCG4", | |||
"POWER8", | |||
"POWER8" | |||
"POWER9" | |||
}; | |||
int detect(void){ | |||
@@ -124,7 +124,7 @@ int detect(void){ | |||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
@@ -156,7 +156,7 @@ int detect(void){ | |||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
return CPUTYPE_POWER5; | |||
@@ -180,7 +180,7 @@ int id; | |||
__asm __volatile("mfpvr %0" : "=r"(id)); | |||
switch ( id >> 16 ) { | |||
case 0x4e: // POWER9 | |||
return CPUTYPE_POWER8; | |||
return CPUTYPE_POWER9; | |||
break; | |||
case 0x4d: | |||
case 0x4b: // POWER8/8E | |||
@@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
("mov %%ebx, %%edi;" | |||
"cpuid;" | |||
"xchgl %%ebx, %%edi;" | |||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); | |||
#else | |||
__asm__ __volatile__ | |||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); | |||
#endif | |||
} | |||
@@ -211,6 +211,44 @@ int support_avx(){ | |||
#endif | |||
} | |||
/*
 * Report whether AVX2 code paths may be selected on the running CPU.
 *
 * Returns 1 when support_avx() reports success and CPUID leaf 7 has the
 * AVX2 feature flag (EBX bit 5) set; returns 0 otherwise. Always returns 0
 * when the build defines NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  // AVX2 is EBX bit 5 of CPUID leaf 7. Bit 7, which used to be tested
  // here, is SMEP and says nothing about vector instruction support.
  return (ebx & (1<<5)) != 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 code paths may be selected on the running CPU.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx() reports success,
 *   - CPUID leaf 7 EBX has bit 5 (AVX2) set,
 *   - CPUID leaf 7 EBX has bit 31 (AVX512VL) set,
 *   - bits 5..7 (mask 0xe0) of the low word returned by xgetbv(0, ...) are
 *     all set (presumably the XCR0 opmask/ZMM state bits -- confirm against
 *     the xgetbv helper).
 * Returns 0 otherwise, and always 0 when NO_AVX or NO_AVX512 is defined.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  // Previously this check only assigned ret=0 (already the default) and
  // fell through, so a missing AVX2 flag never actually vetoed AVX512.
  if((ebx & 32) != 32)
    return 0;  //CPU does not even support AVX2
  // Unsigned mask: 1<<31 would shift into the sign bit of a signed int.
  if(((unsigned int)ebx & (1U<<31)) == 0)
    return 0;  //no AVX512VL
  xgetbv(0, &eax, &edx);
  return (eax & 0xe0) == 0xe0;  //OS supports AVX512VL
#else
  return 0;
#endif
}
int get_vendor(void){ | |||
int eax, ebx, ecx, edx; | |||
@@ -233,6 +271,7 @@ int get_vendor(void){ | |||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | |||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | |||
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; | |||
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; | |||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
@@ -294,6 +333,8 @@ int get_cputype(int gettype){ | |||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | |||
#ifndef NO_AVX | |||
if (support_avx()) feature |= HAVE_AVX; | |||
if (support_avx2()) feature |= HAVE_AVX2; | |||
if (support_avx512()) feature |= HAVE_AVX512VL; | |||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
#endif | |||
@@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
} | |||
} | |||
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { | |||
if ((get_vendor() == VENDOR_AMD) || | |||
(get_vendor() == VENDOR_HYGON) || | |||
(get_vendor() == VENDOR_CENTAUR)) { | |||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | |||
LDTB.size = 4096; | |||
@@ -1228,22 +1271,18 @@ int get_cpuname(void){ | |||
return CPUTYPE_NEHALEM; | |||
case 12: | |||
case 15: | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 13: | |||
//Broadwell | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
@@ -1252,33 +1291,27 @@ int get_cpuname(void){ | |||
switch (model) { | |||
case 5: | |||
case 6: | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 7: | |||
case 15: | |||
//Broadwell | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 14: | |||
//Skylake | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 12: | |||
@@ -1292,80 +1325,66 @@ int get_cpuname(void){ | |||
switch (model) { | |||
case 6: | |||
//Broadwell | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 5: | |||
// Skylake X | |||
#ifndef NO_AVX512 | |||
return CPUTYPE_SKYLAKEX; | |||
#else | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CPUTYPE_HASWELL; | |||
#else | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
#endif | |||
case 14: | |||
// Skylake | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 7: | |||
// Xeon Phi Knights Landing | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 12: | |||
// Apollo Lake | |||
case 15: | |||
// Denverton | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
case 6: | |||
switch (model) { | |||
case 6: // Cannon Lake | |||
#ifndef NO_AVX512 | |||
return CPUTYPE_SKYLAKEX; | |||
#else | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CPUTYPE_HASWELL; | |||
#else | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
#endif | |||
} | |||
break; | |||
case 9: | |||
case 8: | |||
case 8: | |||
switch (model) { | |||
case 14: // Kaby Lake | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
case 14: // Kaby Lake and refreshes | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
#else | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
@@ -1469,6 +1488,26 @@ int get_cpuname(void){ | |||
return CPUTYPE_AMD_UNKNOWN; | |||
} | |||
// Hygon: only family 0xf / extended family 9 (Dhyana) is recognised;
// every other Hygon part is reported as CPUTYPE_HYGON_UNKNOWN.
if (vendor == VENDOR_HYGON) {
  if (family == 0xf && exfamily == 9) {
    //Hygon Dhyana
    if (!support_avx())
      return CPUTYPE_BARCELONA;
#ifndef NO_AVX2
    return CPUTYPE_ZEN;
#else
    return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
  }
  return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){ | |||
switch (family) { | |||
case 0x4: | |||
@@ -1590,7 +1629,8 @@ static char *cpuname[] = { | |||
"STEAMROLLER", | |||
"EXCAVATOR", | |||
"ZEN", | |||
"SKYLAKEX" | |||
"SKYLAKEX", | |||
"DHYANA" | |||
}; | |||
static char *lowercpuname[] = { | |||
@@ -1645,7 +1685,8 @@ static char *lowercpuname[] = { | |||
"steamroller", | |||
"excavator", | |||
"zen", | |||
"skylakex" | |||
"skylakex", | |||
"dhyana" | |||
}; | |||
static char *corename[] = { | |||
@@ -1677,7 +1718,8 @@ static char *corename[] = { | |||
"STEAMROLLER", | |||
"EXCAVATOR", | |||
"ZEN", | |||
"SKYLAKEX" | |||
"SKYLAKEX", | |||
"DHYANA" | |||
}; | |||
static char *corename_lower[] = { | |||
@@ -1709,7 +1751,8 @@ static char *corename_lower[] = { | |||
"steamroller", | |||
"excavator", | |||
"zen", | |||
"skylakex" | |||
"skylakex", | |||
"dhyana" | |||
}; | |||
@@ -2026,6 +2069,23 @@ int get_coretype(void){ | |||
} | |||
} | |||
if (vendor == VENDOR_HYGON){ | |||
if (family == 0xf){ | |||
if (exfamily == 9) { | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_ZEN; | |||
#else | |||
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator | |||
#endif | |||
else | |||
return CORE_BARCELONA; | |||
} else { | |||
return CORE_BARCELONA; | |||
} | |||
} | |||
} | |||
if (vendor == VENDOR_CENTAUR) { | |||
switch (family) { | |||
case 0x6: | |||
@@ -2112,6 +2172,8 @@ void get_cpuconfig(void){ | |||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | |||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | |||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
@@ -2180,6 +2242,8 @@ void get_sse(void){ | |||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | |||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | |||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
@@ -27,9 +27,9 @@ | |||
#include <string.h> | |||
#define CPU_GENERIC 0 | |||
#define CPU_Z13 1 | |||
#define CPU_Z14 2 | |||
#define CPU_GENERIC 0 | |||
#define CPU_Z13 1 | |||
#define CPU_Z14 2 | |||
static char *cpuname[] = { | |||
"ZARCH_GENERIC", | |||
@@ -64,10 +64,8 @@ int detect(void) | |||
if (strstr(p, "2964")) return CPU_Z13; | |||
if (strstr(p, "2965")) return CPU_Z13; | |||
/* detect z14, but fall back to z13 */ | |||
if (strstr(p, "3906")) return CPU_Z13; | |||
if (strstr(p, "3907")) return CPU_Z13; | |||
if (strstr(p, "3906")) return CPU_Z14; | |||
if (strstr(p, "3907")) return CPU_Z14; | |||
return CPU_GENERIC; | |||
} | |||
@@ -116,7 +114,14 @@ void get_cpuconfig(void) | |||
break; | |||
case CPU_Z14: | |||
printf("#define Z14\n"); | |||
printf("#define L1_DATA_SIZE 131072\n"); | |||
printf("#define L1_DATA_LINESIZE 256\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8\n"); | |||
printf("#define L2_SIZE 4194304\n"); | |||
printf("#define L2_LINESIZE 256\n"); | |||
printf("#define L2_ASSOCIATIVE 8\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
} | |||
} |
@@ -113,7 +113,7 @@ ARCH_X86 | |||
ARCH_X86_64 | |||
#endif | |||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) | |||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) | |||
ARCH_POWER | |||
#endif | |||
@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
ifeq ($(ARCH),arm64) | |||
COMMONOBJS += dynamic_arm64.$(SUFFIX) | |||
else | |||
ifeq ($(ARCH),power) | |||
COMMONOBJS += dynamic_power.$(SUFFIX) | |||
else | |||
COMMONOBJS += dynamic.$(SUFFIX) | |||
endif | |||
endif | |||
else | |||
COMMONOBJS += parameter.$(SUFFIX) | |||
endif | |||
@@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
ifeq ($(ARCH),arm64) | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) | |||
else | |||
ifeq ($(ARCH),power) | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) | |||
else | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
endif | |||
endif | |||
else | |||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
endif | |||
@@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
SetEvent(pool.killed); | |||
for(i = 0; i < blas_num_threads - 1; i++){ | |||
// Could also just use WaitForMultipleObjects | |||
WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
#ifndef OS_WINDOWSSTORE | |||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
TerminateThread(blas_threads[i],0); | |||
#endif | |||
CloseHandle(blas_threads[i]); | |||
} | |||
CloseHandle(pool.filled); | |||
CloseHandle(pool.killed); | |||
blas_server_avail = 0; | |||
} | |||
@@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; | |||
#define VENDOR_INTEL 1 | |||
#define VENDOR_AMD 2 | |||
#define VENDOR_CENTAUR 3 | |||
#define VENDOR_HYGON 4 | |||
#define VENDOR_UNKNOWN 99 | |||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
@@ -304,9 +305,49 @@ int support_avx(){ | |||
#endif | |||
} | |||
/*
 * Report whether AVX2 kernels may be selected at run time.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 (sub-leaf 0)
 * advertises AVX2, otherwise 0.  Always returns 0 when the library was
 * built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  /* ecx is preset to 0 as in the original code; NOTE(review): presumably it
     selects sub-leaf 0 if cpuid() forwards it - confirm against cpuid(). */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 feature flag.
     Bit 7, which was tested previously, is SMEP and unrelated to AVX2. */
  if ((ebx & (1<<5)) != 0)
    return 1;
  return 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (SkylakeX) kernels may be selected at run time.
 *
 * Returns 1 only when all of the following hold, otherwise 0:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 (sub-leaf 0) advertises AVX2 (EBX bit 5),
 *   - CPUID leaf 7 (sub-leaf 0) advertises AVX512VL (EBX bit 31),
 *   - XGETBV(0) shows bits 5..7 (mask 0xe0) all set, i.e. the OS has
 *     enabled the opmask / ZMM state components.
 * Always returns 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  /* ecx is preset to 0, matching support_avx2(); NOTE(review): presumably it
     selects sub-leaf 0 if cpuid() forwards it - confirm against cpuid(). */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* The former test "(ebx & (1<<7)) != 1" was always true and only assigned
     ret=0, so it never rejected anything.  Check the real AVX2 bit and bail
     out early instead. */
  if ((ebx & (1u<<5)) == 0)
    return 0;  /* CPU does not even report AVX2 */
  /* Unsigned constant: 1<<31 overflows a signed int. */
  if ((ebx & (1u<<31)) == 0)
    return 0;  /* CPU does not report AVX512VL */
  xgetbv(0, &eax, &edx);
  if ((eax & 0xe0) == 0xe0)
    return 1;  /* OS supports AVX512VL */
  return 0;
#else
  return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg); | |||
#define FALLBACK_VERBOSE 1 | |||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | |||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" | |||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" | |||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" | |||
static int get_vendor(void){ | |||
@@ -329,6 +370,7 @@ static int get_vendor(void){ | |||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | |||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | |||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | |||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
@@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){ | |||
} | |||
//Intel Haswell | |||
if (model == 12 || model == 15) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
//Intel Broadwell | |||
if (model == 13) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
@@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){ | |||
case 4: | |||
//Intel Haswell | |||
if (model == 5 || model == 6) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
//Intel Broadwell | |||
if (model == 7 || model == 15) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
//Intel Skylake | |||
if (model == 14) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
@@ -457,72 +514,86 @@ static gotoblas_t *get_coretype(void){ | |||
case 5: | |||
//Intel Broadwell | |||
if (model == 6) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
if (model == 5) { | |||
// Intel Skylake X | |||
#ifndef NO_AVX512 | |||
return &gotoblas_SKYLAKEX; | |||
#else | |||
if(support_avx()) | |||
if (support_avx512()) | |||
return &gotoblas_SKYLAKEX; | |||
if(support_avx2()){ | |||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
return &gotoblas_HASWELL; | |||
else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
#endif | |||
} | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
//Intel Skylake | |||
if (model == 14) { | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
//Intel Phi Knights Landing | |||
if (model == 7) { | |||
if(support_avx()) | |||
if(support_avx2()){ | |||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
return &gotoblas_HASWELL; | |||
else{ | |||
} | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
//Apollo Lake | |||
if (model == 12) { | |||
//Apollo Lake or Denverton | |||
if (model == 12 || model == 15) { | |||
return &gotoblas_NEHALEM; | |||
} | |||
return NULL; | |||
case 6: | |||
if (model == 6) { | |||
// Cannon Lake | |||
#ifndef NO_AVX512 | |||
return &gotoblas_SKYLAKEX; | |||
#else | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return &gotoblas_HASWELL; | |||
#else | |||
return &gotoblas_SANDYBRIDGE; | |||
#endif | |||
else | |||
return &gotoblas_NEHALEM; | |||
#endif | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
return NULL; | |||
case 9: | |||
case 8: | |||
if (model == 14 ) { // Kaby Lake | |||
if(support_avx()) | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
else{ | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
@@ -535,7 +606,7 @@ static gotoblas_t *get_coretype(void){ | |||
} | |||
} | |||
if (vendor == VENDOR_AMD){ | |||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ | |||
if (family <= 0xe) { | |||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon | |||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |||
@@ -615,6 +686,13 @@ static gotoblas_t *get_coretype(void){ | |||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
} else if (exfamily == 9) { | |||
if(support_avx()) | |||
return &gotoblas_ZEN; | |||
else{ | |||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
} | |||
}else { | |||
return &gotoblas_BARCELONA; | |||
} | |||
@@ -0,0 +1,102 @@ | |||
#include "common.h" | |||
extern gotoblas_t gotoblas_POWER6; | |||
extern gotoblas_t gotoblas_POWER8; | |||
extern gotoblas_t gotoblas_POWER9; | |||
extern void openblas_warning(int verbose, const char *msg); | |||
/* Printable names of the selectable POWER cores.  The position of each entry
   is significant: gotoblas_corename() and force_coretype() both address this
   table by fixed index (0 = unknown, 1 = POWER6, 2 = POWER8, 3 = POWER9). */
static char *corename[] = { | |||
"unknown", | |||
"POWER6", | |||
"POWER8", | |||
"POWER9" | |||
}; | |||
/* Number of entries in corename[]; keep in sync with the table above. */
#define NUM_CORETYPES 4 | |||
char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||
if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
return corename[0]; | |||
} | |||
static gotoblas_t *get_coretype(void) { | |||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||
return &gotoblas_POWER6; | |||
if (__builtin_cpu_is("power8")) | |||
return &gotoblas_POWER8; | |||
if (__builtin_cpu_is("power9")) | |||
return &gotoblas_POWER9; | |||
return NULL; | |||
} | |||
static gotoblas_t *force_coretype(char * coretype) { | |||
int i ; | |||
int found = -1; | |||
char message[128]; | |||
for ( i = 0 ; i < NUM_CORETYPES; i++) | |||
{ | |||
if (!strncasecmp(coretype, corename[i], 20)) | |||
{ | |||
found = i; | |||
break; | |||
} | |||
} | |||
switch (found) | |||
{ | |||
case 1: return (&gotoblas_POWER6); | |||
case 2: return (&gotoblas_POWER8); | |||
case 3: return (&gotoblas_POWER9); | |||
default: return NULL; | |||
} | |||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||
openblas_warning(1, message); | |||
} | |||
void gotoblas_dynamic_init(void) { | |||
char coremsg[128]; | |||
char coren[22]; | |||
char *p; | |||
if (gotoblas) return; | |||
p = getenv("OPENBLAS_CORETYPE"); | |||
if ( p ) | |||
{ | |||
gotoblas = force_coretype(p); | |||
} | |||
else | |||
{ | |||
gotoblas = get_coretype(); | |||
} | |||
if (gotoblas == NULL) | |||
{ | |||
snprintf(coremsg, 128, "Falling back to POWER8 core\n"); | |||
openblas_warning(1, coremsg); | |||
gotoblas = &gotoblas_POWER8; | |||
} | |||
if (gotoblas && gotoblas -> init) { | |||
strncpy(coren,gotoblas_corename(),20); | |||
sprintf(coremsg, "Core: %s\n",coren); | |||
openblas_warning(2, coremsg); | |||
gotoblas -> init(); | |||
} else { | |||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
exit(1); | |||
} | |||
} | |||
void gotoblas_dynamic_quit(void) { | |||
gotoblas = NULL; | |||
} |
@@ -198,45 +198,68 @@ int get_num_procs(void); | |||
#else | |||
int get_num_procs(void) { | |||
static int nums = 0; | |||
cpu_set_t *cpusetp; | |||
size_t size; | |||
int ret; | |||
int i,n; | |||
cpu_set_t cpuset,*cpusetp; | |||
size_t size; | |||
int ret; | |||
#if defined(__GLIBC_PREREQ) | |||
#if !__GLIBC_PREREQ(2, 7) | |||
int i; | |||
#if !__GLIBC_PREREQ(2, 6) | |||
int n; | |||
#endif | |||
#endif | |||
#endif | |||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
#if !defined(OS_LINUX) | |||
return nums; | |||
return nums; | |||
#endif | |||
#if !defined(__GLIBC_PREREQ) | |||
return nums; | |||
return nums; | |||
#else | |||
#if !__GLIBC_PREREQ(2, 3) | |||
return nums; | |||
return nums; | |||
#endif | |||
#if !__GLIBC_PREREQ(2, 7) | |||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
if (ret!=0) return nums; | |||
n=0; | |||
#if !__GLIBC_PREREQ(2, 6) | |||
for (i=0;i<nums;i++) | |||
if (CPU_ISSET(i,cpusetp)) n++; | |||
if (CPU_ISSET(i,cpuset)) n++; | |||
nums=n; | |||
#else | |||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
#endif | |||
return nums; | |||
#else | |||
cpusetp = CPU_ALLOC(nums); | |||
if (cpusetp == NULL) return nums; | |||
size = CPU_ALLOC_SIZE(nums); | |||
ret = sched_getaffinity(0,size,cpusetp); | |||
if (ret!=0) return nums; | |||
ret = CPU_COUNT_S(size,cpusetp); | |||
if (ret > 0 && ret < nums) nums = ret; | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
if (nums >= CPU_SETSIZE) { | |||
cpusetp = CPU_ALLOC(nums); | |||
if (cpusetp == NULL) { | |||
return nums; | |||
} | |||
size = CPU_ALLOC_SIZE(nums); | |||
ret = sched_getaffinity(0,size,cpusetp); | |||
if (ret!=0) { | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
} | |||
ret = CPU_COUNT_S(size,cpusetp); | |||
if (ret > 0 && ret < nums) nums = ret; | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
} else { | |||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
if (ret!=0) { | |||
return nums; | |||
} | |||
ret = CPU_COUNT(&cpuset); | |||
if (ret > 0 && ret < nums) nums = ret; | |||
return nums; | |||
} | |||
#endif | |||
#endif | |||
} | |||
@@ -1073,11 +1096,6 @@ static volatile int memory_initialized = 0; | |||
} | |||
free(table); | |||
} | |||
#if defined(OS_WINDOWS) | |||
TlsFree(local_storage_key); | |||
#else | |||
pthread_key_delete(local_storage_key); | |||
#endif | |||
} | |||
static void blas_memory_init(){ | |||
@@ -1295,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { | |||
free(map_address); | |||
} | |||
#ifdef SMP | |||
/*
 * Hand the table returned by get_memory_table() to blas_memory_cleanup().
 */
void blas_thread_memory_cleanup(void) {
  void *table = (void*)get_memory_table();

  blas_memory_cleanup(table);
}
#endif | |||
void blas_shutdown(void){ | |||
#ifdef SMP | |||
BLASFUNC(blas_thread_shutdown)(); | |||
@@ -1304,7 +1329,7 @@ void blas_shutdown(void){ | |||
/* Only clean up if we were built for threading and TLS was initialized */ | |||
if (local_storage_key) | |||
#endif | |||
blas_memory_cleanup((void*)get_memory_table()); | |||
blas_thread_memory_cleanup(); | |||
#ifdef SEEK_ADDRESS | |||
base_address = 0UL; | |||
@@ -1491,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) { | |||
blas_shutdown(); | |||
#if defined(SMP) | |||
#if defined(OS_WINDOWS) | |||
TlsFree(local_storage_key); | |||
#else | |||
pthread_key_delete(local_storage_key); | |||
#endif | |||
#endif | |||
#ifdef PROFILE | |||
moncontrol (0); | |||
#endif | |||
@@ -1526,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser | |||
break; | |||
case DLL_THREAD_DETACH: | |||
#if defined(SMP) | |||
blas_memory_cleanup((void*)get_memory_table()); | |||
blas_thread_memory_cleanup(); | |||
#endif | |||
break; | |||
case DLL_PROCESS_DETACH: | |||
@@ -1600,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) { | |||
#endif | |||
#else | |||
/* USE_TLS / COMPILE_TLS not set */ | |||
#include <errno.h> | |||
#ifdef OS_WINDOWS | |||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
#define ALLOC_WINDOWS | |||
#ifndef MEM_LARGE_PAGES | |||
#define MEM_LARGE_PAGES 0x20000000 | |||
@@ -1616,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
#include <stdio.h> | |||
#include <fcntl.h> | |||
#ifndef OS_WINDOWS | |||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||
#include <sys/mman.h> | |||
#ifndef NO_SYSV_IPC | |||
#include <sys/shm.h> | |||
@@ -1636,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
#include <sys/resource.h> | |||
#endif | |||
#if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||
#include <sys/sysctl.h> | |||
#include <sys/resource.h> | |||
#endif | |||
@@ -1675,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) { | |||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | |||
#define CONSTRUCTOR __attribute__ ((constructor)) | |||
#define DESTRUCTOR __attribute__ ((destructor)) | |||
#else | |||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) | |||
#define CONSTRUCTOR __attribute__ ((constructor(101))) | |||
#define DESTRUCTOR __attribute__ ((destructor(101))) | |||
#else | |||
#define CONSTRUCTOR __attribute__ ((constructor)) | |||
#define DESTRUCTOR __attribute__ ((destructor)) | |||
#endif | |||
#ifdef DYNAMIC_ARCH | |||
@@ -1701,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {}; | |||
int get_num_procs(void); | |||
#else | |||
int get_num_procs(void) { | |||
static int nums = 0; | |||
cpu_set_t *cpusetp; | |||
size_t size; | |||
int ret; | |||
int i,n; | |||
cpu_set_t cpuset,*cpusetp; | |||
size_t size; | |||
int ret; | |||
#if defined(__GLIBC_PREREQ) | |||
#if !__GLIBC_PREREQ(2, 7) | |||
int i; | |||
#if !__GLIBC_PREREQ(2, 6) | |||
int n; | |||
#endif | |||
#endif | |||
#endif | |||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
#if !defined(OS_LINUX) | |||
return nums; | |||
return nums; | |||
#endif | |||
#if !defined(__GLIBC_PREREQ) | |||
return nums; | |||
return nums; | |||
#else | |||
#if !__GLIBC_PREREQ(2, 3) | |||
return nums; | |||
return nums; | |||
#endif | |||
#if !__GLIBC_PREREQ(2, 7) | |||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
if (ret!=0) return nums; | |||
n=0; | |||
#if !__GLIBC_PREREQ(2, 6) | |||
for (i=0;i<nums;i++) | |||
if (CPU_ISSET(i,cpusetp)) n++; | |||
if (CPU_ISSET(i,cpuset)) n++; | |||
nums=n; | |||
#else | |||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
#endif | |||
return nums; | |||
#else | |||
cpusetp = CPU_ALLOC(nums); | |||
if (cpusetp == NULL) return nums; | |||
size = CPU_ALLOC_SIZE(nums); | |||
ret = sched_getaffinity(0,size,cpusetp); | |||
if (ret!=0) return nums; | |||
nums = CPU_COUNT_S(size,cpusetp); | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
if (nums >= CPU_SETSIZE) { | |||
cpusetp = CPU_ALLOC(nums); | |||
if (cpusetp == NULL) { | |||
return nums; | |||
} | |||
size = CPU_ALLOC_SIZE(nums); | |||
ret = sched_getaffinity(0,size,cpusetp); | |||
if (ret!=0) { | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
} | |||
ret = CPU_COUNT_S(size,cpusetp); | |||
if (ret > 0 && ret < nums) nums = ret; | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
} else { | |||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
if (ret!=0) { | |||
return nums; | |||
} | |||
ret = CPU_COUNT(&cpuset); | |||
if (ret > 0 && ret < nums) nums = ret; | |||
return nums; | |||
} | |||
#endif | |||
#endif | |||
} | |||
@@ -1753,7 +1816,7 @@ int get_num_procs(void) { | |||
return nums; | |||
} | |||
#endif | |||
#ifdef OS_HAIKU | |||
int get_num_procs(void) { | |||
static int nums = 0; | |||
@@ -1790,7 +1853,7 @@ int get_num_procs(void) { | |||
#endif | |||
#if defined(OS_FREEBSD) | |||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
int get_num_procs(void) { | |||
@@ -1867,7 +1930,7 @@ void openblas_fork_handler() | |||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | |||
// In the mean time build with USE_OPENMP=0 or link against another | |||
// implementation of OpenMP. | |||
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
int err; | |||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | |||
if(err != 0) | |||
@@ -1880,7 +1943,7 @@ extern int openblas_goto_num_threads_env(); | |||
extern int openblas_omp_num_threads_env(); | |||
int blas_get_cpu_number(void){ | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
int max_num; | |||
#endif | |||
int blas_goto_num = 0; | |||
@@ -1888,11 +1951,11 @@ int blas_get_cpu_number(void){ | |||
if (blas_num_threads) return blas_num_threads; | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
max_num = get_num_procs(); | |||
#endif | |||
blas_goto_num = 0; | |||
// blas_goto_num = 0; | |||
#ifndef USE_OPENMP | |||
blas_goto_num=openblas_num_threads_env(); | |||
if (blas_goto_num < 0) blas_goto_num = 0; | |||
@@ -1904,7 +1967,7 @@ int blas_get_cpu_number(void){ | |||
#endif | |||
blas_omp_num = 0; | |||
// blas_omp_num = 0; | |||
blas_omp_num=openblas_omp_num_threads_env(); | |||
if (blas_omp_num < 0) blas_omp_num = 0; | |||
@@ -1912,7 +1975,7 @@ int blas_get_cpu_number(void){ | |||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
else blas_num_threads = MAX_CPU_NUMBER; | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
#endif | |||
@@ -1999,11 +2062,15 @@ static void *alloc_mmap(void *address){ | |||
} | |||
if (map_address != (void *)-1) { | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
release_info[release_pos].address = map_address; | |||
release_info[release_pos].func = alloc_mmap_free; | |||
release_pos ++; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
} | |||
#ifdef OS_LINUX | |||
@@ -2145,14 +2212,18 @@ static void *alloc_mmap(void *address){ | |||
#if defined(OS_LINUX) && !defined(NO_WARMUP) | |||
} | |||
#endif | |||
LOCK_COMMAND(&alloc_lock); | |||
if (map_address != (void *)-1) { | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
release_info[release_pos].address = map_address; | |||
release_info[release_pos].func = alloc_mmap_free; | |||
release_pos ++; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
} | |||
UNLOCK_COMMAND(&alloc_lock); | |||
return map_address; | |||
} | |||
@@ -2520,7 +2591,7 @@ void *blas_memory_alloc(int procpos){ | |||
int position; | |||
#if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
int mypos; | |||
int mypos = 0; | |||
#endif | |||
void *map_address; | |||
@@ -2551,6 +2622,11 @@ void *blas_memory_alloc(int procpos){ | |||
NULL, | |||
}; | |||
void *(**func)(void *address); | |||
#if defined(USE_OPENMP) | |||
if (!memory_initialized) { | |||
#endif | |||
LOCK_COMMAND(&alloc_lock); | |||
if (!memory_initialized) { | |||
@@ -2586,6 +2662,9 @@ void *blas_memory_alloc(int procpos){ | |||
} | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#if defined(USE_OPENMP) | |||
} | |||
#endif | |||
#ifdef DEBUG | |||
printf("Alloc Start ...\n"); | |||
@@ -2600,13 +2679,17 @@ void *blas_memory_alloc(int procpos){ | |||
do { | |||
if (!memory[position].used && (memory[position].pos == mypos)) { | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
// blas_lock(&memory[position].lock); | |||
#else | |||
blas_lock(&memory[position].lock); | |||
#endif | |||
if (!memory[position].used) goto allocation; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
// blas_unlock(&memory[position].lock); | |||
#else | |||
blas_unlock(&memory[position].lock); | |||
#endif | |||
} | |||
position ++; | |||
@@ -2618,21 +2701,26 @@ void *blas_memory_alloc(int procpos){ | |||
position = 0; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
do { | |||
/* if (!memory[position].used) { */ | |||
/* blas_lock(&memory[position].lock);*/ | |||
#if defined(USE_OPENMP) | |||
if (!memory[position].used) { | |||
blas_lock(&memory[position].lock); | |||
#endif | |||
if (!memory[position].used) goto allocation; | |||
/* blas_unlock(&memory[position].lock);*/ | |||
/* } */ | |||
#if defined(USE_OPENMP) | |||
blas_unlock(&memory[position].lock); | |||
} | |||
#endif | |||
position ++; | |||
} while (position < NUM_BUFFERS); | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
goto error; | |||
allocation : | |||
@@ -2642,10 +2730,11 @@ void *blas_memory_alloc(int procpos){ | |||
#endif | |||
memory[position].used = 1; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
/* blas_unlock(&memory[position].lock);*/ | |||
#else | |||
blas_unlock(&memory[position].lock); | |||
#endif | |||
if (!memory[position].addr) { | |||
do { | |||
#ifdef DEBUG | |||
@@ -2690,9 +2779,13 @@ void *blas_memory_alloc(int procpos){ | |||
} while ((BLASLONG)map_address == -1); | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
memory[position].addr = map_address; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
#ifdef DEBUG | |||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | |||
@@ -2746,8 +2839,9 @@ void blas_memory_free(void *free_area){ | |||
#endif | |||
position = 0; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | |||
position++; | |||
@@ -2761,7 +2855,9 @@ void blas_memory_free(void *free_area){ | |||
WMB; | |||
memory[position].used = 0; | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
#ifdef DEBUG | |||
printf("Unmap Succeeded.\n\n"); | |||
@@ -2776,8 +2872,9 @@ void blas_memory_free(void *free_area){ | |||
for (position = 0; position < NUM_BUFFERS; position++) | |||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | |||
#endif | |||
#if defined(SMP) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
return; | |||
} | |||
@@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include <string.h> | |||
#if defined(_WIN32) && defined(_MSC_VER) | |||
#if _MSC_VER < 1900 | |||
#define snprintf _snprintf | |||
#endif | |||
#endif | |||
static char* openblas_config_str="" | |||
"OpenBLAS " | |||
VERSION | |||
@@ -141,6 +141,14 @@ else | |||
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed | |||
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c | |||
endif | |||
ifeq ($(F_COMPILER), INTEL) | |||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
else | |||
ifneq ($(C_COMPILER), LSB) | |||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
@@ -152,6 +160,7 @@ else | |||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
endif | |||
endif | |||
rm -f linktest | |||
@@ -40,15 +40,25 @@ | |||
void gotoblas_init(void); | |||
void gotoblas_quit(void); | |||
#if defined(SMP) && defined(USE_TLS) | |||
void blas_thread_memory_cleanup(void); | |||
#endif | |||
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { | |||
if (reason == DLL_PROCESS_ATTACH) { | |||
gotoblas_init(); | |||
} | |||
if (reason == DLL_PROCESS_DETACH) { | |||
gotoblas_quit(); | |||
switch(reason) { | |||
case DLL_PROCESS_ATTACH: | |||
gotoblas_init(); | |||
break; | |||
case DLL_PROCESS_DETACH: | |||
gotoblas_quit(); | |||
break; | |||
case DLL_THREAD_ATTACH: | |||
break; | |||
case DLL_THREAD_DETACH: | |||
#if defined(SMP) && defined(USE_TLS) | |||
blas_thread_memory_cleanup(); | |||
#endif | |||
break; | |||
} | |||
return TRUE; | |||
@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include <unistd.h> | |||
#endif | |||
/* Compiler capability gate for AVX512 targets.
 * NO_AVX512 is defined unless the compiler is either
 *   - GCC with __GNUC__ > 6 and __AVX2__ defined, or
 *   - clang with __clang_major__ >= 6.
 * The #if branch is intentionally empty; only the #else branch acts.
 * NO_AVX512 is tested by the FORCE_SKYLAKEX section of this file, which
 * then reports HASWELL settings instead of SKYLAKEX ones. */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
#else | |||
#define NO_AVX512 | |||
#endif | |||
/* #define FORCE_P2 */ | |||
/* #define FORCE_KATMAI */ | |||
/* #define FORCE_COPPERMINE */ | |||
@@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#ifdef FORCE_SKYLAKEX | |||
#ifdef NO_AVX512 | |||
#define FORCE | |||
#define FORCE_INTEL | |||
#define ARCHITECTURE "X86" | |||
#define SUBARCHITECTURE "HASWELL" | |||
#define ARCHCONFIG "-DHASWELL " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
"-DFMA3" | |||
#define LIBNAME "haswell" | |||
#define CORENAME "HASWELL" | |||
#else | |||
#define FORCE | |||
#define FORCE_INTEL | |||
#define ARCHITECTURE "X86" | |||
@@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define LIBNAME "skylakex" | |||
#define CORENAME "SKYLAKEX" | |||
#endif | |||
#endif | |||
#ifdef FORCE_ATOM | |||
#define FORCE | |||
@@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "POWER8" | |||
#endif | |||
/* Forced target: POWER9.
 * When FORCE_POWER9 is defined, marks the target as forced (FORCE) and
 * sets the identification strings (architecture "POWER", sub-architecture
 * and core name "POWER9", kernel sub-directory "power", library name
 * "power9").
 * ARCHCONFIG carries the -D flags describing the core: L1 data cache
 * 32768 with 128-byte lines, L2 4194304 with 128-byte lines and
 * associativity 8, and the DTB defaults (128 entries, size 4096). */
#if defined(FORCE_POWER9) | |||
#define FORCE | |||
#define ARCHITECTURE "POWER" | |||
#define SUBARCHITECTURE "POWER9" | |||
#define SUBDIRNAME "power" | |||
#define ARCHCONFIG "-DPOWER9 " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
#define LIBNAME "power9" | |||
#define CORENAME "POWER9" | |||
#endif | |||
#ifdef FORCE_PPCG4 | |||
#define FORCE | |||
@@ -1046,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
#endif | |||
/* Forced target: TSV110 (ARM64).
 * When FORCE_TSV110 is defined, marks the target as forced and sets the
 * identification strings (architecture "ARM64", sub-architecture and core
 * name "TSV110", kernel sub-directory "arm64", library name "tsv110").
 * ARCHCONFIG describes the core: L1 code and L1 data caches of 65536
 * with 64-byte lines and associativity 4, L2 of 524288 with 64-byte
 * lines and associativity 8, the DTB defaults, and the
 * VFP/VFPv3/VFPv4/NEON/ARMV8 feature flags.
 * The trailing #else branch is empty. */
#ifdef FORCE_TSV110 | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "TSV110" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DTSV110 " \ | |||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
#define LIBNAME "tsv110" | |||
#define CORENAME "TSV110" | |||
#else | |||
#endif | |||
#ifdef FORCE_ZARCH_GENERIC | |||
#define FORCE | |||
#define ARCHITECTURE "ZARCH" | |||
@@ -1066,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "Z13" | |||
#endif | |||
/* Forced target: Z14 (ZARCH).
 * When FORCE_Z14 is defined, marks the target as forced and sets the
 * identification strings (architecture "ZARCH", sub-architecture and core
 * name "Z14", library name "z14").
 * ARCHCONFIG only supplies -DZ14 and the DTB default entry count; no
 * cache geometry is given for this target.
 * Note: no SUBDIRNAME is defined in this block. */
#ifdef FORCE_Z14 | |||
#define FORCE | |||
#define ARCHITECTURE "ZARCH" | |||
#define SUBARCHITECTURE "Z14" | |||
#define ARCHCONFIG "-DZ14 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64" | |||
#define LIBNAME "z14" | |||
#define CORENAME "Z14" | |||
#endif | |||
#ifndef FORCE | |||
#ifdef USER_TARGET | |||
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" | |||
#endif | |||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | |||
#ifndef POWER | |||
@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES | |||
rotm.c rotmg.c # N.B. these do not have complex counterparts | |||
rot.c | |||
asum.c | |||
sum.c | |||
) | |||
# these will have 'z' prepended for the complex version | |||
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
endif () | |||
if (${float_type} STREQUAL "ZCOMPLEX") | |||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | |||
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
endif () | |||
endforeach () | |||
@@ -25,7 +25,7 @@ SBLAS1OBJS = \ | |||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | |||
scopy.$(SUFFIX) sscal.$(SUFFIX) \ | |||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | |||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | |||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | |||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | |||
@@ -51,7 +51,7 @@ DBLAS1OBJS = \ | |||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | |||
ddot.$(SUFFIX) \ | |||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | |||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | |||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | |||
@@ -76,7 +76,7 @@ CBLAS1OBJS = \ | |||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | |||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | |||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | |||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
scamax.$(SUFFIX) icamax.$(SUFFIX) \ | |||
scamin.$(SUFFIX) icamin.$(SUFFIX) \ | |||
csrot.$(SUFFIX) crotg.$(SUFFIX) \ | |||
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \ | |||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | |||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | |||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | |||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | |||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | |||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | |||
@@ -146,7 +146,7 @@ QBLAS1OBJS = \ | |||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
qdot.$(SUFFIX) \ | |||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
@@ -168,7 +168,7 @@ XBLAS1OBJS = \ | |||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | |||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION | |||
QBLAS1OBJS = \ | |||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
@@ -224,7 +224,7 @@ QBLAS3OBJS = \ | |||
XBLAS1OBJS = \ | |||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
@@ -263,7 +263,8 @@ CSBLAS1OBJS = \ | |||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
CSBLAS2OBJS = \ | |||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
@@ -280,7 +281,8 @@ CDBLAS1OBJS = \ | |||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
CDBLAS2OBJS = \ | |||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
@@ -300,7 +302,8 @@ CCBLAS1OBJS = \ | |||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
cblas_caxpby.$(SUFFIX) | |||
cblas_caxpby.$(SUFFIX) \ | |||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||
CCBLAS2OBJS = \ | |||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | |||
@@ -326,7 +329,9 @@ CZBLAS1OBJS = \ | |||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
cblas_zaxpby.$(SUFFIX) | |||
cblas_zaxpby.$(SUFFIX) \ | |||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||
CZBLAS2OBJS = \ | |||
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ | |||
@@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c | |||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
# SUM interface objects (Fortran-style entry points).
# All six variants (s, d, q, sc, dz, qx) are compiled from the same source,
# sum.c, for both the regular ($(SUFFIX)) and profiled ($(PSUFFIX)) object.
# The recipes are identical and pass no precision-specific -D flags.
# NOTE(review): presumably the precision/complex defines reach the compiler
# through CFLAGS set per target elsewhere in the build - confirm.
# "-o $(@F)" writes the object under the file-name part of the target.
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | |||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||
@@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c | |||
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c | |||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
# CBLAS index-of-max / index-of-min objects for the complex types (c, z).
# All four are built from imax.c with -DCBLAS and USE_ABS undefined:
#   *max : -UUSE_ABS -UUSE_MIN
#   *min : -UUSE_ABS -DUSE_MIN
# so USE_MIN alone selects the minimum variant.
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c | |||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c | |||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
@@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c | |||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
# CBLAS SUM objects (s, d, sc, dz).
# Same source as the Fortran-style objects (sum.c), compiled with -DCBLAS.
# There are no cblas_q*/cblas_qx* variants in this group.
# NOTE(review): as with the non-CBLAS rules, the precision/complex defines
# are not passed here - presumably supplied via per-target CFLAGS; confirm.
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
@@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | |||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
@@ -0,0 +1,97 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef FUNCTION_PROFILE | |||
#include "functable.h" | |||
#endif | |||
/*
 * SUM interface: thin wrappers that validate the length and forward to the
 * SUM_K kernel. Exactly one of the two entry points is compiled, selected
 * by the CBLAS macro.
 * NOTE(review): SUM_K is not defined in this file; presumably it returns the
 * plain (non-absolute) sum of the vector elements - confirm against the
 * kernel implementation.
 */
#ifndef CBLAS | |||
/*
 * Fortran-style entry point (arguments passed by reference).
 *   N    - pointer to the element count
 *   x    - vector data
 *   INCX - pointer to the stride between elements
 * Returns 0 when *N <= 0 without calling the kernel; otherwise the SUM_K
 * result cast to FLOATRET.
 */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
BLASLONG n = *N; | |||
BLASLONG incx = *INCX; | |||
FLOATRET ret; | |||
PRINT_DEBUG_NAME; | |||
/* Early exit: the debug/profiling hooks below are skipped for n <= 0. */
if (n <= 0) return 0; | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
ret = (FLOATRET)SUM_K(n, x, incx); | |||
FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
IDEBUG_END; | |||
return ret; | |||
} | |||
#else | |||
/*
 * CBLAS entry point (arguments passed by value).
 * In COMPLEX builds the vector is received as void* and cast to FLOAT*;
 * otherwise it is received as FLOAT* directly. Both signatures share the
 * body below.
 * Returns 0 when n <= 0 without calling the kernel; otherwise the SUM_K
 * result.
 */
#ifdef COMPLEX | |||
FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
FLOAT *x = (FLOAT*) vx; | |||
#else | |||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
#endif | |||
FLOAT ret; | |||
PRINT_DEBUG_CNAME; | |||
/* Early exit: the debug/profiling hooks below are skipped for n <= 0. */
if (n <= 0) return 0; | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
ret = SUM_K(n, x, incx); | |||
FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
IDEBUG_END; | |||
return ret; | |||
} | |||
#endif |
@@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
buffer = (FLOAT *)blas_memory_alloc(1); | |||
#ifdef SMP | |||
/* nthreads = num_cpu_avail(2); | |||
nthreads = num_cpu_avail(2); | |||
FIXME trmv_thread was found to be broken, see issue 1332 */ | |||
nthreads = 1; | |||
if (nthreads == 1) { | |||
#endif | |||
@@ -81,6 +81,12 @@ | |||
#endif | |||
#endif | |||
/* SMP_FACTOR: multiplier applied to GEMM_MULTITHREAD_THRESHOLD when
 * deciding whether to run single-threaded. Where it is used in this file,
 * a call with args.m * args.n below
 * SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD gets args.nthreads = 1.
 * Real builds use 256, COMPLEX builds use 128. */
#ifndef COMPLEX | |||
#define SMP_FACTOR 256 | |||
#else | |||
#define SMP_FACTOR 128 | |||
#endif | |||
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
#ifndef TRMM | |||
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, | |||
@@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, | |||
if (side < 0) info = 1; | |||
if (info != 0) { | |||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); | |||
return; | |||
} | |||
@@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, | |||
mode |= (trans << BLAS_TRANSA_SHIFT); | |||
mode |= (side << BLAS_RSIDE_SHIFT); | |||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
/* | |||
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
args.nthreads = 1; | |||
else | |||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
args.nthreads = 1; | |||
*/ | |||
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) | |||
args.nthreads = 1; | |||
else | |||
args.nthreads = num_cpu_avail(3); | |||
@@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
} else | |||
nthreads = 1; | |||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||
nthreads = 1; | |||
if(nthreads > 1) { | |||
buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
} | |||
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) | |||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | |||
@@ -340,6 +340,32 @@ ifndef XSCALKERNEL | |||
XSCALKERNEL = zscal.S | |||
endif | |||
### SUM ### | |||
# Default kernel source for each precision's SUM kernel. A value already
# set (e.g. by a target-specific kernel configuration) is kept; otherwise
# the real precisions (S, D, Q) default to sum.S and the complex
# precisions (C, Z, X) default to zsum.S.
ifndef SSUMKERNEL | |||
SSUMKERNEL = sum.S | |||
endif | |||
ifndef DSUMKERNEL | |||
DSUMKERNEL = sum.S | |||
endif | |||
ifndef CSUMKERNEL | |||
CSUMKERNEL = zsum.S | |||
endif | |||
ifndef ZSUMKERNEL | |||
ZSUMKERNEL = zsum.S | |||
endif | |||
ifndef QSUMKERNEL | |||
QSUMKERNEL = sum.S | |||
endif | |||
ifndef XSUMKERNEL | |||
XSUMKERNEL = zsum.S | |||
endif | |||
### SWAP ### | |||
ifndef SSWAPKERNEL | |||
@@ -453,7 +479,7 @@ endif | |||
SBLASOBJS += \ | |||
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | |||
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | |||
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
saxpby_k$(TSUFFIX).$(SUFFIX) | |||
@@ -463,31 +489,32 @@ DBLASOBJS += \ | |||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
daxpby_k$(TSUFFIX).$(SUFFIX) | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
QBLASOBJS += \ | |||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) | |||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
qsum_k$(TSUFFIX).$(SUFFIX) | |||
CBLASOBJS += \ | |||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | |||
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | |||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) | |||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) | |||
ZBLASOBJS += \ | |||
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | |||
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | |||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) | |||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) | |||
XBLASOBJS += \ | |||
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | |||
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | |||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) | |||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||
### AMAX ### | |||
@@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | |||
### ASUM ### | |||
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
@@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
### SUM ### | |||
# Build rules for the six SUM kernel objects. Each object is compiled from
# the source named by the matching <P>SUMKERNEL variable, and the
# precision is selected purely by preprocessor flags:
#   s: -UCOMPLEX -UDOUBLE    d: -UCOMPLEX -DDOUBLE    q: -UCOMPLEX -DXDOUBLE
#   c: -DCOMPLEX -UDOUBLE    z: -DCOMPLEX -DDOUBLE    x: -DCOMPLEX -DXDOUBLE
# Each rule lists both the regular and the profiled
# ($(TPSUFFIX).$(PSUFFIX)) object as targets.
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ | |||
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
### AXPY ### | |||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(TARGET), GENERIC) | |||
ifeq ($(CORE), GENERIC) | |||
USE_TRMM = 1 | |||
endif | |||
@@ -44,10 +44,18 @@ ifeq ($(CORE), POWER8) | |||
USE_TRMM = 1 | |||
endif | |||
# Enable USE_TRMM for the POWER9 core, for every core when ARCH is zarch,
# and for the Z14 core.
# NOTE(review): if Z14 is only ever built with ARCH=zarch, the Z14 test is
# already covered by the zarch test above it - confirm before simplifying.
ifeq ($(CORE), POWER9) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(ARCH), zarch) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(CORE), Z14) | |||
USE_TRMM = 1 | |||
endif | |||
@@ -0,0 +1,206 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "version.h" | |||
/* Look-ahead distance used by the "ldl $31, PREFETCHSIZE * 2 * SIZE(X)"
   instruction in the main loop. */
#define PREFETCHSIZE 88 | |||
/* Arguments: element count, vector pointer and stride.
   NOTE(review): $16-$18 are presumably the first three integer argument
   registers of the Alpha calling convention - confirm. */
#define N $16 | |||
#define X $17 | |||
#define INCX $18 | |||
/* Loop counter. */
#define I $19 | |||
/* s0..s3: four independent partial sums; s0 holds the final result. */
#define s0 $f0 | |||
#define s1 $f1 | |||
#define s2 $f10 | |||
#define s3 $f11 | |||
/* a0..a7: destinations for the eight elements loaded per unrolled group. */
#define a0 $f12 | |||
#define a1 $f13 | |||
#define a2 $f14 | |||
#define a3 $f15 | |||
#define a4 $f16 | |||
#define a5 $f17 | |||
#define a6 $f18 | |||
#define a7 $f19 | |||
/* t0..t3: values staged (via fmov) for the next ADD into s0..s3. */
#define t0 $f20 | |||
#define t1 $f21 | |||
#define t2 $f22 | |||
#define t3 $f23 | |||
/*
 * SUM kernel: adds up N elements of X taken INCX elements apart and leaves
 * the total in s0. No absolute value is taken; loaded values are only
 * copied with fmov before being accumulated.
 *
 * Structure:
 *   - groups of eight elements are processed by a software-pipelined loop
 *     ($L12) with a drain stage ($L13), using four partial sums s0..s3;
 *   - the remaining N & 7 elements are handled one at a time ($L17);
 *   - one value is always left "pending" in t0 and is folded into s0 by
 *     the next ADD, the last one being at $L999.
 *
 * NOTE(review): LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE are
 * macros defined outside this file; SXADDQ INCX, X, X presumably advances
 * X by INCX elements - confirm.
 */
PROLOGUE | |||
PROFCODE | |||
/* s0 = 0, t0 = 0; for N <= 0 go straight to the exit, which returns
   s0 + t0 = 0. */
fclr s0 | |||
unop | |||
fclr t0 | |||
ble N, $L999 | |||
/* I = N >> 3 = number of full eight-element groups; none -> tail only. */
sra N, 3, I | |||
fclr s1 | |||
fclr s2 | |||
ble I, $L15 | |||
/* Pipeline fill: preload a0..a5 of the first group, interleaved with
   clearing the remaining staging registers and s3. */
LD a0, 0 * SIZE(X) | |||
fclr t1 | |||
SXADDQ INCX, X, X | |||
fclr t2 | |||
LD a1, 0 * SIZE(X) | |||
fclr t3 | |||
SXADDQ INCX, X, X | |||
fclr s3 | |||
LD a2, 0 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
LD a3, 0 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
LD a4, 0 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
LD a5, 0 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
/* Only one group: skip the steady-state loop and drain directly. */
lda I, -1(I) | |||
ble I, $L13 | |||
.align 4 | |||
/* Steady state: each pass accumulates the previously staged t0..t3,
   stages the current group's a0..a7, loads a6/a7 of the current group and
   preloads a0..a5 of the next group. */
$L12: | |||
ADD s0, t0, s0 | |||
/* Load into $31; NOTE(review): presumably a prefetch, since $31 is used
   as the destination - confirm. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X) | |||
fmov a0, t0 | |||
lda I, -1(I) | |||
ADD s1, t1, s1 | |||
LD a6, 0 * SIZE(X) | |||
fmov a1, t1 | |||
SXADDQ INCX, X, X | |||
ADD s2, t2, s2 | |||
LD a7, 0 * SIZE(X) | |||
fmov a2, t2 | |||
SXADDQ INCX, X, X | |||
ADD s3, t3, s3 | |||
LD a0, 0 * SIZE(X) | |||
fmov a3, t3 | |||
SXADDQ INCX, X, X | |||
ADD s0, t0, s0 | |||
LD a1, 0 * SIZE(X) | |||
fmov a4, t0 | |||
SXADDQ INCX, X, X | |||
ADD s1, t1, s1 | |||
LD a2, 0 * SIZE(X) | |||
fmov a5, t1 | |||
SXADDQ INCX, X, X | |||
ADD s2, t2, s2 | |||
LD a3, 0 * SIZE(X) | |||
fmov a6, t2 | |||
SXADDQ INCX, X, X | |||
ADD s3, t3, s3 | |||
LD a4, 0 * SIZE(X) | |||
fmov a7, t3 | |||
SXADDQ INCX, X, X | |||
LD a5, 0 * SIZE(X) | |||
unop | |||
SXADDQ INCX, X, X | |||
bne I, $L12 | |||
.align 4 | |||
/* Drain: finish the last group. Only a6 and a7 still need loading; no
   further group is preloaded. */
$L13: | |||
ADD s0, t0, s0 | |||
LD a6, 0 * SIZE(X) | |||
fmov a0, t0 | |||
SXADDQ INCX, X, X | |||
ADD s1, t1, s1 | |||
LD a7, 0 * SIZE(X) | |||
fmov a1, t1 | |||
SXADDQ INCX, X, X | |||
ADD s2, t2, s2 | |||
fmov a2, t2 | |||
ADD s3, t3, s3 | |||
fmov a3, t3 | |||
ADD s0, t0, s0 | |||
fmov a4, t0 | |||
ADD s1, t1, s1 | |||
fmov a5, t1 | |||
ADD s2, t2, s2 | |||
fmov a6, t2 | |||
ADD s3, t3, s3 | |||
fmov a7, t3 | |||
/* Fold the staged t1..t3 (a5, a6, a7). t0 (a4) is deliberately left
   pending: it is added by the first ADD in $L17 or by the one at $L999. */
ADD s1, t1, s1 | |||
ADD s2, t2, s2 | |||
ADD s3, t3, s3 | |||
/* Reduce four partial sums to two (s0, s2). */
ADD s0, s1, s0 | |||
ADD s2, s3, s2 | |||
.align 4 | |||
/* Tail: I = N & 7 leftover elements; fold s2 into s0 first. */
$L15: | |||
and N, 7, I | |||
ADD s0, s2, s0 | |||
unop | |||
ble I, $L999 | |||
.align 4 | |||
/* One element per pass: add the pending t0, then load the next element
   and stage it in t0. */
$L17: | |||
ADD s0, t0, s0 | |||
LD a0, 0 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
fmov a0, t0 | |||
lda I, -1(I) | |||
bne I, $L17 | |||
.align 4 | |||
/* Exit: add the last pending value; the result is in s0 ($f0). */
$L999: | |||
ADD s0, t0, s0 | |||
ret | |||
EPILOGUE |
@@ -0,0 +1,208 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "version.h" | |||
/* Offset, in units of SIZE, added to X by the "ldl $31" load in the main loop. */
#define PREFETCHSIZE 88 | |||
/* Integer registers: N, X and INCX are read as inputs, I is the loop counter. */
#define N $16 | |||
#define X $17 | |||
#define INCX $18 | |||
#define I $19 | |||
/* Running partial sums; s0 holds the final total when the routine returns. */
#define s0 $f0 | |||
#define s1 $f1 | |||
#define s2 $f10 | |||
#define s3 $f11 | |||
/* Destination registers for values loaded from X. */
#define a0 $f12 | |||
#define a1 $f13 | |||
#define a2 $f14 | |||
#define a3 $f15 | |||
#define a4 $f16 | |||
#define a5 $f17 | |||
#define a6 $f18 | |||
#define a7 $f19 | |||
/* Staging registers: fmov copies a loaded value here, a later ADD folds it into a sum. */
#define t0 $f20 | |||
#define t1 $f21 | |||
#define t2 $f22 | |||
#define t3 $f23 | |||
/*
 * Adds up both values (loaded at 0 * SIZE and 1 * SIZE) of each of the N
 * strided elements starting at X and leaves the total in s0.  No absolute
 * value is taken.
 *
 * The code is software-pipelined: a loaded value is copied into t0..t3 by
 * fmov and only added to s0..s3 by an ADD issued later, so the instruction
 * order below is significant.
 *
 * LD, ADD and SXADDQ are macros defined outside this file; SXADDQ is
 * presumably a size-scaled add that advances X by INCX values -- TODO confirm
 * against common.h.
 */
PROLOGUE | |||
PROFCODE | |||
fclr s0 | |||
unop | |||
fclr t0 | |||
/* Double INCX: every element is made of two values. */
addq INCX, INCX, INCX | |||
fclr s1 | |||
unop | |||
fclr t1 | |||
/* N <= 0: nothing to add; $L999 then combines the cleared registers. */
ble N, $L999 | |||
fclr s2 | |||
/* I = N / 4, the number of four-element groups for the unrolled code. */
sra N, 2, I | |||
fclr s3 | |||
ble I, $L15 | |||
/* Prime the pipeline with the first three elements (a0..a5). */
LD a0, 0 * SIZE(X) | |||
fclr t2 | |||
LD a1, 1 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
LD a2, 0 * SIZE(X) | |||
fclr t3 | |||
LD a3, 1 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
LD a4, 0 * SIZE(X) | |||
LD a5, 1 * SIZE(X) | |||
SXADDQ INCX, X, X | |||
/* One group is finished by the drain code at $L13, so loop I - 1 times. */
lda I, -1(I) | |||
ble I, $L13 | |||
.align 4 | |||
/* Main loop: each pass issues eight loads (a6, a7, a0..a5) and eight ADDs of
   values staged earlier, i.e. four elements per pass. */
$L12: | |||
ADD s0, t0, s0 | |||
/* Load into $31; used as a prefetch ahead of X.
   NOTE(review): relies on $31 discarding the loaded value -- confirm. */
ldl $31, PREFETCHSIZE * SIZE(X) | |||
fmov a0, t0 | |||
lda I, -1(I) | |||
ADD s1, t1, s1 | |||
LD a6, 0 * SIZE(X) | |||
fmov a1, t1 | |||
unop | |||
ADD s2, t2, s2 | |||
LD a7, 1 * SIZE(X) | |||
fmov a2, t2 | |||
SXADDQ INCX, X, X | |||
ADD s3, t3, s3 | |||
LD a0, 0 * SIZE(X) | |||
fmov a3, t3 | |||
unop | |||
ADD s0, t0, s0 | |||
LD a1, 1 * SIZE(X) | |||
fmov a4, t0 | |||
SXADDQ INCX, X, X | |||
ADD s1, t1, s1 | |||
LD a2, 0 * SIZE(X) | |||
fmov a5, t1 | |||
unop | |||
ADD s2, t2, s2 | |||
LD a3, 1 * SIZE(X) | |||
fmov a6, t2 | |||
SXADDQ INCX, X, X | |||
ADD s3, t3, s3 | |||
LD a4, 0 * SIZE(X) | |||
fmov a7, t3 | |||
unop | |||
LD a5, 1 * SIZE(X) | |||
unop | |||
SXADDQ INCX, X, X | |||
bne I, $L12 | |||
.align 4 | |||
/* Drain: load the last element of the final group (a6, a7) and add what is
   still in flight.  a4/a5 stay staged in t0/t1 and are added at $L17 or
   $L999. */
$L13: | |||
ADD s0, t0, s0 | |||
LD a6, 0 * SIZE(X) | |||
fmov a0, t0 | |||
ADD s1, t1, s1 | |||
LD a7, 1 * SIZE(X) | |||
fmov a1, t1 | |||
SXADDQ INCX, X, X | |||
ADD s2, t2, s2 | |||
fmov a2, t2 | |||
ADD s3, t3, s3 | |||
fmov a3, t3 | |||
ADD s0, t0, s0 | |||
fmov a4, t0 | |||
ADD s1, t1, s1 | |||
fmov a5, t1 | |||
ADD s2, t2, s2 | |||
fmov a6, t2 | |||
ADD s3, t3, s3 | |||
fmov a7, t3 | |||
ADD s2, t2, s2 | |||
ADD s3, t3, s3 | |||
.align 4 | |||
/* Fold s2/s3 into s0/s1 and compute the leftover element count I = N & 3. */
$L15: | |||
ADD s0, s2, s0 | |||
and N, 3, I | |||
ADD s1, s3, s1 | |||
ble I, $L999 | |||
.align 4 | |||
/* Remainder loop: one element per pass; the staged t0/t1 from the previous
   step are added before the next pair is staged. */
$L17: | |||
ADD s0, t0, s0 | |||
LD a0, 0 * SIZE(X) | |||
fmov a0, t0 | |||
lda I, -1(I) | |||
ADD s1, t1, s1 | |||
LD a1, 1 * SIZE(X) | |||
fmov a1, t1 | |||
SXADDQ INCX, X, X | |||
bne I, $L17 | |||
.align 4 | |||
/* Add the last staged pair, then combine both partial sums into s0. */
$L999: | |||
ADD s0, t0, s0 | |||
ADD s1, t1, s1 | |||
ADD s0, s1, s0 | |||
ret | |||
EPILOGUE |
@@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||
CASUMKERNEL = ../arm/zasum.c | |||
ZASUMKERNEL = ../arm/zasum.c | |||
SSUMKERNEL = ../arm/sum.c | |||
DSUMKERNEL = ../arm/sum.c | |||
CSUMKERNEL = ../arm/zsum.c | |||
ZSUMKERNEL = ../arm/zsum.c | |||
SAXPYKERNEL = ../arm/axpy.c | |||
DAXPYKERNEL = ../arm/axpy.c | |||
CAXPYKERNEL = ../arm/zaxpy.c | |||
@@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S | |||
CASUMKERNEL = asum_vfp.S | |||
ZASUMKERNEL = asum_vfp.S | |||
SSUMKERNEL = sum_vfp.S | |||
DSUMKERNEL = sum_vfp.S | |||
SAXPYKERNEL = axpy_vfp.S | |||
DAXPYKERNEL = axpy_vfp.S | |||
CAXPYKERNEL = axpy_vfp.S | |||
@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
while(i < n) | |||
{ | |||
if( x[ix] > minf ) | |||
if( x[ix] < minf ) | |||
{ | |||
min = i; | |||
minf = x[ix]; | |||
@@ -0,0 +1,51 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
/************************************************************************************** | |||
* trivial copy of asum.c with the ABS() removed * | |||
**************************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
/*
 * Plain (signed) sum of n elements of x, taken inc_x entries apart.
 * Returns 0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0)
		return total;

	/* One past the last index visited: n elements, inc_x apart. */
	limit = n * inc_x;
	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return total;
}
@@ -0,0 +1,425 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
/************************************************************************************** | |||
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * | |||
**************************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* NOTE(review): STACKSIZE is not referenced by the macros or routine in this file. */
#define STACKSIZE 256 | |||
/* Registers read as inputs by the routine (N, X, INC_X) and the loop counter I. */
#define N r0 | |||
#define X r1 | |||
#define INC_X r2 | |||
#define I r12 | |||
/* Byte offset from X used by the pld prefetch instructions. */
#define X_PRE 512 | |||
/************************************************************************************** | |||
* Macro definitions | |||
**************************************************************************************/ | |||
/*
 * Four variants of the same macro set are defined, selected by COMPLEX and
 * DOUBLE:
 *   KERNEL_F4  contiguous data, four elements, X advanced by load writeback
 *   KERNEL_F1  contiguous data, one element
 *   KERNEL_S4  strided data, four elements, X advanced by INC_X after each
 *   KERNEL_S1  strided data, one element
 * In the COMPLEX variants an element is a pair of values and both are added.
 * KERNEL_F4 alternates between accumulators 0 and 1 (d0/d1 or s0/s1); every
 * other macro accumulates into d0/s0 only.
 */
#if !defined(COMPLEX) | |||
#if defined(DOUBLE) | |||
/* Real double, contiguous: add four values, alternating d0 and d1. */
.macro KERNEL_F4 | |||
pld [ X, #X_PRE ] | |||
vldmia.f64 X!, { d4 - d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vldmia.f64 X!, { d6 - d7 } | |||
vadd.f64 d1 , d1, d5 | |||
vadd.f64 d0 , d0, d6 | |||
vadd.f64 d1 , d1, d7 | |||
.endm | |||
/* Real double, contiguous: add one value to d0. */
.macro KERNEL_F1 | |||
vldmia.f64 X!, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
.endm | |||
/* Real double, strided: add four values to d0, stepping X by INC_X each time. */
.macro KERNEL_S4 | |||
vldmia.f64 X, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
add X, X, INC_X | |||
.endm | |||
/* Real double, strided: add one value to d0. */
.macro KERNEL_S1 | |||
vldmia.f64 X, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
add X, X, INC_X | |||
.endm | |||
#else | |||
/* Real single, contiguous: add four values, alternating s0 and s1.
   This variant has no pld of its own; the routine issues one before it. */
.macro KERNEL_F4 | |||
vldmia.f32 X!, { s4 - s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vldmia.f32 X!, { s6 - s7 } | |||
vadd.f32 s1 , s1, s5 | |||
vadd.f32 s0 , s0, s6 | |||
vadd.f32 s1 , s1, s7 | |||
.endm | |||
/* Real single, contiguous: add one value to s0. */
.macro KERNEL_F1 | |||
vldmia.f32 X!, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
.endm | |||
/* Real single, strided: add four values to s0, stepping X by INC_X each time. */
.macro KERNEL_S4 | |||
vldmia.f32 X, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
add X, X, INC_X | |||
.endm | |||
/* Real single, strided: add one value to s0. */
.macro KERNEL_S1 | |||
vldmia.f32 X, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
add X, X, INC_X | |||
.endm | |||
#endif | |||
#else | |||
#if defined(DOUBLE) | |||
/* Complex double, contiguous: four elements = eight values, alternating d0 and d1. */
.macro KERNEL_F4 | |||
pld [ X, #X_PRE ] | |||
vldmia.f64 X!, { d4 - d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vldmia.f64 X!, { d6 - d7 } | |||
vadd.f64 d1 , d1, d5 | |||
vadd.f64 d0 , d0, d6 | |||
vadd.f64 d1 , d1, d7 | |||
pld [ X, #X_PRE ] | |||
vldmia.f64 X!, { d4 - d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vldmia.f64 X!, { d6 - d7 } | |||
vadd.f64 d1 , d1, d5 | |||
vadd.f64 d0 , d0, d6 | |||
vadd.f64 d1 , d1, d7 | |||
.endm | |||
/* Complex double, contiguous: one element = two values, both added to d0. */
.macro KERNEL_F1 | |||
vldmia.f64 X!, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
vldmia.f64 X!, { d4 } | |||
vadd.f64 d0 , d0, d4 | |||
.endm | |||
/* Complex double, strided: four elements; each pair is loaded together and added to d0. */
.macro KERNEL_S4 | |||
vldmia.f64 X, { d4 -d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vadd.f64 d0 , d0, d5 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 -d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vadd.f64 d0 , d0, d5 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 -d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vadd.f64 d0 , d0, d5 | |||
add X, X, INC_X | |||
vldmia.f64 X, { d4 -d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vadd.f64 d0 , d0, d5 | |||
add X, X, INC_X | |||
.endm | |||
/* Complex double, strided: one element (pair) added to d0. */
.macro KERNEL_S1 | |||
vldmia.f64 X, { d4 -d5 } | |||
vadd.f64 d0 , d0, d4 | |||
vadd.f64 d0 , d0, d5 | |||
add X, X, INC_X | |||
.endm | |||
#else | |||
/* Complex single, contiguous: four elements = eight values, alternating s0 and s1. */
.macro KERNEL_F4 | |||
pld [ X, #X_PRE ] | |||
vldmia.f32 X!, { s4 - s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vldmia.f32 X!, { s6 - s7 } | |||
vadd.f32 s1 , s1, s5 | |||
vadd.f32 s0 , s0, s6 | |||
vadd.f32 s1 , s1, s7 | |||
vldmia.f32 X!, { s4 - s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vldmia.f32 X!, { s6 - s7 } | |||
vadd.f32 s1 , s1, s5 | |||
vadd.f32 s0 , s0, s6 | |||
vadd.f32 s1 , s1, s7 | |||
.endm | |||
/* Complex single, contiguous: one element = two values, both added to s0. */
.macro KERNEL_F1 | |||
vldmia.f32 X!, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
vldmia.f32 X!, { s4 } | |||
vadd.f32 s0 , s0, s4 | |||
.endm | |||
/* Complex single, strided: four elements; each pair is loaded together and added to s0. */
.macro KERNEL_S4 | |||
vldmia.f32 X, { s4 -s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vadd.f32 s0 , s0, s5 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 -s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vadd.f32 s0 , s0, s5 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 -s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vadd.f32 s0 , s0, s5 | |||
add X, X, INC_X | |||
vldmia.f32 X, { s4 -s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vadd.f32 s0 , s0, s5 | |||
add X, X, INC_X | |||
.endm | |||
/* Complex single, strided: one element (pair) added to s0. */
.macro KERNEL_S1 | |||
vldmia.f32 X, { s4 -s5 } | |||
vadd.f32 s0 , s0, s4 | |||
vadd.f32 s0 , s0, s5 | |||
add X, X, INC_X | |||
.endm | |||
#endif | |||
#endif | |||
/************************************************************************************** | |||
* End of macro definitions | |||
**************************************************************************************/ | |||
/*
 * Signed sum of the N elements of X taken with stride INC_X.
 *
 * The two accumulators (s0/s1, or d0/d1 when DOUBLE) are zeroed, the data is
 * consumed through the KERNEL_* macros, and the accumulators are added
 * together at asum_kernel_L999.  The result is left in s0/d0 and, when
 * __ARM_PCS_VFP is not defined, also copied to r0 (single) or r0:r1 (double).
 *
 * N <= 0 or INC_X == 0 returns the zeroed accumulator.  INC_X == 1 takes the
 * contiguous path (KERNEL_F*); any other INC_X takes the strided path
 * (KERNEL_S*) after INC_X has been converted to a byte stride.
 *
 * The labels keep the asum_ prefix of the file this kernel was copied from.
 */
PROLOGUE | |||
.align 5 | |||
movs r12, #0 // clear floating point register | |||
vmov s0, r12 | |||
vmov s1, r12 | |||
#if defined(DOUBLE) | |||
vcvt.f64.f32 d0, s0 | |||
vcvt.f64.f32 d1, s1 | |||
#endif | |||
cmp N, #0 | |||
ble asum_kernel_L999 | |||
cmp INC_X, #0 | |||
beq asum_kernel_L999 | |||
cmp INC_X, #1 | |||
bne asum_kernel_S_BEGIN | |||
/* Contiguous path. */
asum_kernel_F_BEGIN: | |||
asrs I, N, #2 // I = N / 4 | |||
ble asum_kernel_F1 | |||
.align 5 | |||
/* Two KERNEL_F4 expansions per pass, with an exit check between them. */
asum_kernel_F4: | |||
#if !defined(DOUBLE) && !defined(COMPLEX) | |||
pld [ X, #X_PRE ] | |||
#endif | |||
KERNEL_F4 | |||
subs I, I, #1 | |||
ble asum_kernel_F1 | |||
KERNEL_F4 | |||
subs I, I, #1 | |||
bne asum_kernel_F4 | |||
/* Remaining N & 3 contiguous elements, one at a time. */
asum_kernel_F1: | |||
ands I, N, #3 | |||
ble asum_kernel_L999 | |||
asum_kernel_F10: | |||
KERNEL_F1 | |||
subs I, I, #1 | |||
bne asum_kernel_F10 | |||
b asum_kernel_L999 | |||
/* Strided path: turn INC_X from an element count into a byte stride first. */
asum_kernel_S_BEGIN: | |||
#if defined(COMPLEX) | |||
#if defined(DOUBLE) | |||
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||
#else | |||
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||
#endif | |||
#else | |||
#if defined(DOUBLE) | |||
lsl INC_X, INC_X, #3 // INC_X * SIZE | |||
#else | |||
lsl INC_X, INC_X, #2 // INC_X * SIZE | |||
#endif | |||
#endif | |||
asrs I, N, #2 // I = N / 4 | |||
ble asum_kernel_S1 | |||
.align 5 | |||
asum_kernel_S4: | |||
KERNEL_S4 | |||
subs I, I, #1 | |||
bne asum_kernel_S4 | |||
/* Remaining N & 3 strided elements, one at a time. */
asum_kernel_S1: | |||
ands I, N, #3 | |||
ble asum_kernel_L999 | |||
asum_kernel_S10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne asum_kernel_S10 | |||
/* Common exit: combine both accumulators into the return register. */
asum_kernel_L999: | |||
#if defined(DOUBLE) | |||
vadd.f64 d0 , d0, d1 // set return value | |||
#else | |||
vadd.f32 s0 , s0, s1 // set return value | |||
#endif | |||
/* Without __ARM_PCS_VFP the result is also moved to core registers. */
#if !defined(__ARM_PCS_VFP) | |||
#if !defined(DOUBLE) | |||
vmov r0, s0 | |||
#else | |||
vmov r0, r1, d0 | |||
#endif | |||
#endif | |||
bx lr | |||
EPILOGUE | |||
@@ -0,0 +1,57 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
/************************************************************************************** | |||
* trivial copy of zasum.c with the ABS() removed * | |||
**************************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
/*
 * Sum of the two values (real and imaginary part) of the complex element
 * that starts at x[i].  The expansion and its arguments are parenthesized so
 * the macro stays correct when used inside a larger expression or with
 * non-trivial arguments.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Signed sum of the real and imaginary parts of n complex elements of x.
 *
 * n      number of complex elements
 * x      interleaved (real, imaginary) data
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	FLOAT sumf = 0.0;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) return(sumf);

	/* Each complex element occupies two FLOATs, so the index step is 2 * inc_x. */
	inc_x2 = 2 * inc_x;
	n *= inc_x2;	/* n becomes the exclusive upper bound for i */
	while (i < n)
	{
		sumf += CSUM1(x, i);
		i += inc_x2;
	}
	return(sumf);
}
@@ -0,0 +1,175 @@ | |||
# Kernel source selection: each <PREFIX><ROUTINE>KERNEL variable names the
# source file used for that routine (S/D/C/Z = single, double, complex single,
# complex double; a leading I marks the index-returning variants).

# Level-1 min/max kernels taken from the C implementations in ../arm.
SAMINKERNEL = ../arm/amin.c | |||
DAMINKERNEL = ../arm/amin.c | |||
CAMINKERNEL = ../arm/zamin.c | |||
ZAMINKERNEL = ../arm/zamin.c | |||
SMAXKERNEL = ../arm/max.c | |||
DMAXKERNEL = ../arm/max.c | |||
SMINKERNEL = ../arm/min.c | |||
DMINKERNEL = ../arm/min.c | |||
ISAMINKERNEL = ../arm/iamin.c | |||
IDAMINKERNEL = ../arm/iamin.c | |||
ICAMINKERNEL = ../arm/izamin.c | |||
IZAMINKERNEL = ../arm/izamin.c | |||
ISMAXKERNEL = ../arm/imax.c | |||
IDMAXKERNEL = ../arm/imax.c | |||
ISMINKERNEL = ../arm/imin.c | |||
IDMINKERNEL = ../arm/imin.c | |||
# Generic TRMM defaults. NOTE(review): all four variables are assigned again
# next to the GEMM kernels further down; the later assignment is the one in
# effect, assuming nothing reads them in between.
STRMMKERNEL = ../generic/trmmkernel_4x4.c | |||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
# TRSM kernels: the same generic C sources for every precision.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# Level-1 and level-2 kernels implemented as assembly (.S) files named
# without a directory prefix.
SAMAXKERNEL = amax.S | |||
DAMAXKERNEL = amax.S | |||
CAMAXKERNEL = zamax.S | |||
ZAMAXKERNEL = zamax.S | |||
ISAMAXKERNEL = iamax.S | |||
IDAMAXKERNEL = iamax.S | |||
ICAMAXKERNEL = izamax.S | |||
IZAMAXKERNEL = izamax.S | |||
SASUMKERNEL = asum.S | |||
DASUMKERNEL = asum.S | |||
CASUMKERNEL = casum.S | |||
ZASUMKERNEL = zasum.S | |||
SAXPYKERNEL = axpy.S | |||
DAXPYKERNEL = axpy.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
SCOPYKERNEL = copy.S | |||
DCOPYKERNEL = copy.S | |||
CCOPYKERNEL = copy.S | |||
ZCOPYKERNEL = copy.S | |||
SDOTKERNEL = dot.S | |||
DDOTKERNEL = dot.S | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
DSDOTKERNEL = dot.S | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
SSCALKERNEL = scal.S | |||
DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SSWAPKERNEL = swap.S | |||
DSWAPKERNEL = swap.S | |||
CSWAPKERNEL = swap.S | |||
ZSWAPKERNEL = swap.S | |||
SGEMVNKERNEL = gemv_n.S | |||
DGEMVNKERNEL = gemv_n.S | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
SGEMVTKERNEL = gemv_t.S | |||
DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
# SGEMM: the kernel file name is built from the unroll factors. The inner
# (M-sized) copy routines are only set when the M and N unroll factors differ.
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM: same scheme, but assembly copy routines are selected when the M
# unroll is 8 (inner copies) or the N unroll is 4 (outer copies); every other
# value falls back to the generic C copies.
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
ifeq ($(DGEMM_UNROLL_M), 8) | |||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
else | |||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
endif | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifeq ($(DGEMM_UNROLL_N), 4) | |||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
else | |||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
endif | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM: assembly kernel, generic zgemm copy routines.
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM: assembly kernel, generic zgemm copy routines.
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
@@ -0,0 +1,164 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define N x0 /* vector length */ | |||
#define X x1 /* X vector address */ | |||
#define INC_X x2 /* X stride */ | |||
#define I x5 /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
/* REG0: zero source used to clear SUMF.  SUMF: scalar result/accumulator.
   TMPF: scalar view of v1, the load destination in KERNEL_F1 / KERNEL_S1.
   NOTE(review): TMPVF and SZ are not referenced anywhere in this file. */
#define REG0 wzr | |||
#define SUMF s0 | |||
#define TMPF s1 | |||
#define TMPVF {v1.s}[0] | |||
#define SZ 4 | |||
/******************************************************************************/ | |||
/* Contiguous, one element: load two 32-bit values into v1, rotate the second
   one into lane 0 of v2, add both into TMPF and then into SUMF.  X is
   post-incremented by 8 bytes. */
.macro KERNEL_F1 | |||
ld1 {v1.2s}, [X], #8 | |||
ext v2.8b, v1.8b, v1.8b, #4 | |||
fadd TMPF, TMPF, s2 | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/* Contiguous, eight elements: load 64 bytes (sixteen 32-bit values) into
   v1..v4, advance X, prefetch ahead, and accumulate lane-wise into v0.4s. */
.macro KERNEL_F8 | |||
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] | |||
add X, X, #64 | |||
PRFM PLDL1KEEP, [X, #1024] | |||
fadd v1.4s, v1.4s, v2.4s | |||
fadd v3.4s, v3.4s, v4.4s | |||
fadd v0.4s, v0.4s, v1.4s | |||
fadd v0.4s, v0.4s, v3.4s | |||
.endm | |||
/* Horizontal reduction of the four lanes of v0 into the scalar SUMF. */
.macro KERNEL_F8_FINALIZE | |||
ext v1.16b, v0.16b, v0.16b, #8 | |||
fadd v0.2s, v0.2s, v1.2s | |||
faddp SUMF, v0.2s | |||
.endm | |||
/* Strided setup: multiply INC_X by 8, the byte size of the two 32-bit values
   loaded per element. */
.macro INIT_S | |||
lsl INC_X, INC_X, #3 | |||
.endm | |||
/* Strided, one element: same arithmetic as KERNEL_F1, but X is
   post-incremented by the byte stride in INC_X. */
.macro KERNEL_S1 | |||
ld1 {v1.2s}, [X], INC_X | |||
ext v2.8b, v1.8b, v1.8b, #4 | |||
fadd TMPF, TMPF, s2 | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
/*
 * Signed sum of both 32-bit values of each of the N elements of X, taken
 * with stride INC_X; the result is left in SUMF (s0).
 *
 * N <= 0 or INC_X <= 0 returns the zeroed SUMF.  INC_X == 1 uses the
 * contiguous path (groups of eight via KERNEL_F8, then N & 7 single
 * elements); any other stride uses the strided path (four KERNEL_S1 per
 * pass, then N & 3 single elements).
 */
PROLOGUE | |||
/* Zero the accumulator and the scratch register s1. */
fmov SUMF, REG0 | |||
fmov s1, SUMF | |||
cmp N, xzr | |||
ble .Lcsum_kernel_L999 | |||
cmp INC_X, xzr | |||
ble .Lcsum_kernel_L999 | |||
cmp INC_X, #1 | |||
bne .Lcsum_kernel_S_BEGIN | |||
/* Contiguous path: I = N / 8 vector passes. */
.Lcsum_kernel_F_BEGIN: | |||
asr I, N, #3 | |||
cmp I, xzr | |||
beq .Lcsum_kernel_F1 | |||
.Lcsum_kernel_F8: | |||
KERNEL_F8 | |||
subs I, I, #1 | |||
bne .Lcsum_kernel_F8 | |||
/* Collapse the vector accumulator into SUMF before the scalar tail. */
KERNEL_F8_FINALIZE | |||
/* Tail: remaining N & 7 elements. */
.Lcsum_kernel_F1: | |||
ands I, N, #7 | |||
ble .Lcsum_kernel_L999 | |||
.Lcsum_kernel_F10: | |||
KERNEL_F1 | |||
subs I, I, #1 | |||
bne .Lcsum_kernel_F10 | |||
.Lcsum_kernel_L999: | |||
ret | |||
/* Strided path: INIT_S turns INC_X into a byte stride; I = N / 4 passes. */
.Lcsum_kernel_S_BEGIN: | |||
INIT_S | |||
asr I, N, #2 | |||
cmp I, xzr | |||
ble .Lcsum_kernel_S1 | |||
.Lcsum_kernel_S4: | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lcsum_kernel_S4 | |||
/* Tail: remaining N & 3 elements. */
.Lcsum_kernel_S1: | |||
ands I, N, #3 | |||
ble .Lcsum_kernel_L999 | |||
.Lcsum_kernel_S10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lcsum_kernel_S10 | |||
ret | |||
EPILOGUE |
@@ -0,0 +1,186 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Register roles for this kernel: N is the element count, X the address of
   the next element to read, INC_X the stride between elements (compared
   against 1 to select the contiguous path and scaled to bytes by INIT_S)
   and I the loop counter. */
#define N x0 /* vector length */ | |||
#define X x1 /* X vector address */ | |||
#define INC_X x2 /* X stride */ | |||
#define I x5 /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
/* Precision selection. Without DOUBLE the kernel works on 4-byte elements in
   s registers; with DOUBLE it works on 8-byte elements in d registers.
     REG0   zero register used to clear the accumulator
     SUMF   scalar accumulator (lane 0 of v0) that holds the result
     TMPF   scalar view of lane 0 of v1, used as a temporary
     TMPVF  the same lane of v1 written as a single-lane ld1 operand
     SZ     element size in bytes */
#if !defined(DOUBLE) | |||
#define REG0 wzr | |||
#define SUMF s0 | |||
#define TMPF s1 | |||
#define TMPVF {v1.s}[0] | |||
#define SZ 4 | |||
#else | |||
#define REG0 xzr | |||
#define SUMF d0 | |||
#define TMPF d1 | |||
#define TMPVF {v1.d}[0] | |||
#define SZ 8 | |||
#endif | |||
/******************************************************************************/ | |||
/* KERNEL_F1: contiguous single step. Loads one element, post-incrementing X
   by SZ bytes, and adds it to the scalar accumulator SUMF. */
.macro KERNEL_F1 | |||
ldr TMPF, [X], #SZ | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/* KERNEL_F8: contiguous main step. Consumes eight elements per expansion
   (32 bytes in single precision, 64 bytes in double precision), accumulates
   them lane-wise into v0 and prefetches 1024 bytes past the updated X.
   NOTE: the lane annotations on the single-precision lines are stale; the
   ld1 there fetches two 4-lane vectors (eight floats), which are added
   together and then into the four lanes of v0. */
.macro KERNEL_F8 | |||
#if !defined(DOUBLE) | |||
ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] | |||
fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] | |||
fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] | |||
PRFM PLDL1KEEP, [X, #1024] | |||
#else // DOUBLE | |||
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] | |||
add X, X, #64 | |||
PRFM PLDL1KEEP, [X, #1024] | |||
fadd v2.2d, v2.2d, v3.2d | |||
fadd v4.2d, v4.2d, v5.2d | |||
fadd v0.2d, v0.2d, v2.2d | |||
fadd v0.2d, v0.2d, v4.2d | |||
#endif | |||
.endm | |||
/* KERNEL_F8_FINALIZE: collapses the lane-wise partial sums held in v0 into
   the scalar SUMF. Single precision first folds the upper 64 bits onto the
   lower 64 bits (ext by 8 bytes + fadd) and then pair-adds the remaining two
   lanes; double precision needs only the pair-add. */
.macro KERNEL_F8_FINALIZE | |||
#if !defined(DOUBLE) | |||
ext v1.16b, v0.16b, v0.16b, #8 | |||
fadd v0.2s, v0.2s, v1.2s | |||
faddp SUMF, v0.2s | |||
#else | |||
faddp SUMF, v0.2d | |||
#endif | |||
.endm | |||
/* INIT_S: converts INC_X from an element count to a byte offset
   (multiply by 4 in single precision, by 8 in double precision). */
.macro INIT_S | |||
#if !defined(DOUBLE) | |||
lsl INC_X, INC_X, #2 | |||
#else | |||
lsl INC_X, INC_X, #3 | |||
#endif | |||
.endm | |||
/* KERNEL_S1: strided single step. Loads one element into lane 0 of v1,
   post-incrementing X by INC_X (already in bytes after INIT_S), and adds it
   to SUMF. */
.macro KERNEL_S1 | |||
ld1 TMPVF, [X], INC_X | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
/* Sum kernel entry point.
   Inputs : N (x0) element count, X (x1) data address, INC_X (x2) stride in
            elements.
   Result : SUMF (s0 or d0) = sum of the N selected elements; 0 when
            N <= 0 or INC_X <= 0.
   INC_X == 1 takes the vectorised contiguous path (8 elements per pass plus
   a scalar tail); any other positive stride takes the scalar strided path
   (4 elements per pass plus a tail). */
PROLOGUE | |||
/* Clear the accumulator and the temporary. */
fmov SUMF, REG0 | |||
#if !defined(DOUBLE) | |||
fmov s1, SUMF | |||
#else | |||
fmov d1, SUMF | |||
#endif | |||
/* Nothing to add for a non-positive length or stride: return the zero. */
cmp N, xzr | |||
ble .Lsum_kernel_L999 | |||
cmp INC_X, xzr | |||
ble .Lsum_kernel_L999 | |||
cmp INC_X, #1 | |||
bne .Lsum_kernel_S_BEGIN | |||
/* Contiguous path: I = N / 8 full vector passes. */
.Lsum_kernel_F_BEGIN: | |||
asr I, N, #3 | |||
cmp I, xzr | |||
beq .Lsum_kernel_F1 | |||
.Lsum_kernel_F8: | |||
KERNEL_F8 | |||
subs I, I, #1 | |||
bne .Lsum_kernel_F8 | |||
/* Reduce the vector partial sums to the scalar before the scalar tail. */
KERNEL_F8_FINALIZE | |||
/* Contiguous tail: the remaining N % 8 elements, one at a time. */
.Lsum_kernel_F1: | |||
ands I, N, #7 | |||
ble .Lsum_kernel_L999 | |||
.Lsum_kernel_F10: | |||
KERNEL_F1 | |||
subs I, I, #1 | |||
bne .Lsum_kernel_F10 | |||
.Lsum_kernel_L999: | |||
ret | |||
/* Strided path: scale the stride to bytes, then I = N / 4 unrolled passes. */
.Lsum_kernel_S_BEGIN: | |||
INIT_S | |||
asr I, N, #2 | |||
cmp I, xzr | |||
ble .Lsum_kernel_S1 | |||
.Lsum_kernel_S4: | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lsum_kernel_S4 | |||
/* Strided tail: the remaining N % 4 elements. */
.Lsum_kernel_S1: | |||
ands I, N, #3 | |||
ble .Lsum_kernel_L999 | |||
.Lsum_kernel_S10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lsum_kernel_S10 | |||
ret | |||
EPILOGUE |
@@ -0,0 +1,158 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Register roles for this kernel: N is the element count, X the address of
   the next element to read, INC_X the stride between elements (compared
   against 1 to select the contiguous path and scaled to bytes by INIT_S)
   and I the loop counter. */
#define N x0 /* vector length */ | |||
#define X x1 /* X vector address */ | |||
#define INC_X x2 /* X stride */ | |||
#define I x5 /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
/* This kernel is hard-wired to double precision: there is no DOUBLE
   conditional, the accumulator SUMF and temporary TMPF are d registers and
   every element is read as a pair of doubles (16 bytes).
   TMPVF and SZ are defined but not referenced by the macros or the function
   body of this file. */
#define REG0 xzr | |||
#define SUMF d0 | |||
#define TMPF d1 | |||
#define TMPVF {v1.d}[0] | |||
#define SZ 8 | |||
/******************************************************************************/ | |||
/* KERNEL_F1: contiguous single step. Loads one 16-byte element (two doubles)
   with post-increment, adds the two halves together and accumulates the
   result into SUMF. */
.macro KERNEL_F1 | |||
ld1 {v1.2d}, [X], #16 | |||
faddp TMPF, v1.2d | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/* KERNEL_F4: contiguous main step. Loads four 16-byte elements (64 bytes),
   adds them lane-wise into v0 (lane 0 and lane 1 are kept separate until
   KERNEL_F4_FINALIZE) and prefetches 1024 bytes past the updated X. */
.macro KERNEL_F4 | |||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 | |||
fadd v1.2d, v1.2d, v2.2d | |||
fadd v3.2d, v3.2d, v4.2d | |||
fadd v0.2d, v0.2d, v1.2d | |||
fadd v0.2d, v0.2d, v3.2d | |||
PRFM PLDL1KEEP, [X, #1024] | |||
.endm | |||
/* KERNEL_F4_FINALIZE: adds the two lanes of v0 into the scalar SUMF. */
.macro KERNEL_F4_FINALIZE | |||
faddp SUMF, v0.2d | |||
.endm | |||
/* INIT_S: converts INC_X from elements to bytes (16 bytes per element). */
.macro INIT_S | |||
lsl INC_X, INC_X, #4 | |||
.endm | |||
/* KERNEL_S1: strided single step. Same as KERNEL_F1 but X advances by INC_X
   bytes instead of 16. */
.macro KERNEL_S1 | |||
ld1 {v1.2d}, [X], INC_X | |||
faddp TMPF, v1.2d | |||
fadd SUMF, SUMF, TMPF | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
/* Sum kernel entry point for elements made of two doubles.
   Inputs : N (x0) element count, X (x1) data address, INC_X (x2) stride in
            elements.
   Result : SUMF (d0) = sum of both doubles of each of the N selected
            elements; 0 when N <= 0 or INC_X <= 0.
   INC_X == 1 takes the contiguous path (4 elements per pass plus a tail);
   any other positive stride takes the strided path. */
PROLOGUE | |||
/* Clear the accumulator. */
fmov SUMF, REG0 | |||
/* Nothing to add for a non-positive length or stride: return the zero. */
cmp N, xzr | |||
ble .Lzsum_kernel_L999 | |||
cmp INC_X, xzr | |||
ble .Lzsum_kernel_L999 | |||
cmp INC_X, #1 | |||
bne .Lzsum_kernel_S_BEGIN | |||
/* Contiguous path: I = N / 4 full vector passes. */
.Lzsum_kernel_F_BEGIN: | |||
asr I, N, #2 | |||
cmp I, xzr | |||
beq .Lzsum_kernel_F1 | |||
.Lzsum_kernel_F4: | |||
KERNEL_F4 | |||
subs I, I, #1 | |||
bne .Lzsum_kernel_F4 | |||
/* Fold the two lanes of v0 into the scalar before the tail. */
KERNEL_F4_FINALIZE | |||
/* Contiguous tail: the remaining N % 4 elements. */
.Lzsum_kernel_F1: | |||
ands I, N, #3 | |||
ble .Lzsum_kernel_L999 | |||
.Lzsum_kernel_F10: | |||
KERNEL_F1 | |||
subs I, I, #1 | |||
bne .Lzsum_kernel_F10 | |||
.Lzsum_kernel_L999: | |||
ret | |||
/* Strided path: scale the stride to bytes, then I = N / 4 unrolled passes. */
.Lzsum_kernel_S_BEGIN: | |||
INIT_S | |||
asr I, N, #2 | |||
cmp I, xzr | |||
ble .Lzsum_kernel_S1 | |||
.Lzsum_kernel_S4: | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lzsum_kernel_S4 | |||
/* Strided tail: the remaining N % 4 elements. */
.Lzsum_kernel_S1: | |||
ands I, N, #3 | |||
ble .Lzsum_kernel_L999 | |||
.Lzsum_kernel_S10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne .Lzsum_kernel_S10 | |||
ret | |||
EPILOGUE |
@@ -60,6 +60,10 @@ CASUMKERNEL = asum.S | |||
ZASUMKERNEL = asum.S | |||
XASUMKERNEL = asum.S | |||
# Sum kernels for the C, Z and X variants all build from the shared sum.S source.
CSUMKERNEL = sum.S | |||
ZSUMKERNEL = sum.S | |||
XSUMKERNEL = sum.S | |||
CNRM2KERNEL = nrm2.S | |||
ZNRM2KERNEL = nrm2.S | |||
XNRM2KERNEL = nrm2.S | |||
@@ -0,0 +1,358 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2019, The OpenBLAS project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Prefetch distance in elements, chosen per precision. It is multiplied by
   SIZE when the prefetch pointer PRE1 is initialised from X. */
#ifdef XDOUBLE | |||
#define PREFETCH_SIZE ( 8 * 16 + 4) | |||
#elif defined(DOUBLE) | |||
#define PREFETCH_SIZE (16 * 16 + 8) | |||
#else | |||
#define PREFETCH_SIZE (32 * 16 + 16) | |||
#endif | |||
/* COMPADD is 1 when building the COMPLEX variant and 0 otherwise; it lowers
   the unroll shift/bit positions by one and raises the stride shift by one.
   STRIDE is the post-increment applied by the first load of every load pair:
   the full stride INCX for real data, or SIZE for complex data (where INCX
   itself is reduced by SIZE before the loop so that one STRIDE step plus one
   INCX step advances by a whole stride). */
#ifndef COMPLEX | |||
#define COMPADD 0 | |||
#define STRIDE INCX | |||
#else | |||
#define COMPADD 1 | |||
#define STRIDE SIZE | |||
#endif | |||
/* Register roles: PRE1 prefetch pointer, I unrolled-loop trip count,
   J leftover count, INCX16 prefetch pointer step, PR/ARLC saved copies of the
   predicate registers and ar.lc, N/X/INCX the incoming arguments. */
#define PRE1 r2 | |||
#define I r17 | |||
#define J r18 | |||
#define INCX16 r21 | |||
#define PR r30 | |||
#define ARLC r31 | |||
#define N r32 | |||
#define X r33 | |||
#define INCX r34 | |||
/* Sum kernel entry point.
   Inputs : N element count, X data address, INCX stride in elements (both
            integers are read through pointers when F_INTERFACE is defined).
   Result : f8 = sum of the selected values; 0.0 when N <= 0 or INCX <= 0.
   Structure: a loop handling 16 values per pass (8 elements of two values
   each in the COMPLEX build) with four accumulators f8..f11, followed by a
   tail that handles 8, 4, 2 and (real build only) 1 leftover values selected
   by the low bits of N, and a final reduction at .L998. */
PROLOGUE | |||
.prologue | |||
PROFCODE | |||
/* PRE1 = X + prefetch distance; f8 = 0.0 (f0 is the architectural constant
   0.0); keep the caller's ar.lc in ARLC. */
{ .mfi | |||
adds PRE1 = PREFETCH_SIZE * SIZE, X | |||
mov f8 = f0 | |||
.save ar.lc, ARLC | |||
mov ARLC = ar.lc | |||
} | |||
;; | |||
.body | |||
/* Fortran-style interface: N and INCX arrive as pointers; dereference them
   and, for 32-bit integers, sign-extend. */
#ifdef F_INTERFACE | |||
{ .mmi | |||
LDINT N = [N] | |||
LDINT INCX = [INCX] | |||
nop.i 0 | |||
} | |||
;; | |||
#ifndef USE64BITINT | |||
{ .mii | |||
nop.m 0 | |||
sxt4 N = N | |||
sxt4 INCX = INCX | |||
} | |||
;; | |||
#endif | |||
#endif | |||
/* p6 = !(0 < INCX), p7 = !(0 < N): either one returns immediately with the
   0.0 already in f8. I = number of full unrolled passes, J = leftover count. */
{ .mmi | |||
cmp.lt p0, p6 = r0, INCX | |||
cmp.lt p0, p7 = r0, N | |||
shr I = N, (4 - COMPADD) | |||
} | |||
{ .mbb | |||
and J = ((1 << (4 - COMPADD)) - 1), N | |||
(p6) br.ret.sptk.many b0 | |||
(p7) br.ret.sptk.many b0 | |||
} | |||
;; | |||
/* I - 1 becomes the loop count; clear accumulators f9/f10 and save the
   predicate registers. p9 = (J == 0); p12 = the "8 leftover values" bit of N. */
{ .mfi | |||
adds I = -1, I | |||
mov f10 = f0 | |||
mov PR = pr | |||
} | |||
{ .mfi | |||
cmp.eq p9, p0 = r0, J | |||
mov f9 = f0 | |||
tbit.z p0, p12 = N, 3 - COMPADD | |||
} | |||
;; | |||
/* Prime the pipeline stage predicates (p16 set, p17..p19 clear), set the
   epilogue count to 3, clear f11 and scale INCX from elements to bytes. */
{ .mmi | |||
cmp.eq p16, p0 = r0, r0 | |||
cmp.ne p17, p0 = r0, r0 | |||
mov ar.ec= 3 | |||
} | |||
{ .mfi | |||
cmp.ne p18, p0 = r0, r0 | |||
mov f11 = f0 | |||
shl INCX = INCX, BASE_SHIFT + COMPADD | |||
} | |||
;; | |||
/* INCX16 = step of the prefetch pointer per lfetch (the XDOUBLE build issues
   two lfetches per pass, so its step is half as large). */
{ .mmi | |||
#ifdef XDOUBLE | |||
shladd INCX16 = INCX, (3 - COMPADD), r0 | |||
#else | |||
shladd INCX16 = INCX, (4 - COMPADD), r0 | |||
#endif | |||
cmp.ne p19, p0 = r0, r0 | |||
mov ar.lc = I | |||
} | |||
/* p8 = (I < 0): no full pass, go straight to the tail. In the COMPLEX build
   INCX is reduced by SIZE because the first load of each pair already
   advances X by SIZE (see STRIDE). */
{ .mmb | |||
cmp.gt p8 ,p0 = r0, I | |||
#ifdef COMPLEX | |||
adds INCX = - SIZE, INCX | |||
#else | |||
nop.m 0 | |||
#endif | |||
(p8) br.cond.dpnt .L55 | |||
} | |||
;; | |||
.align 32 | |||
/* Main loop, closed by br.ctop. Each pass issues 16 loads under p16; the
   additions run under the later stage predicates p18/p19 and therefore read
   higher-numbered registers than the loads wrote (f32 is consumed as f34,
   f68 as f71, and so on), i.e. values loaded in earlier passes. */
.L52: | |||
{ .mmf | |||
(p16) lfetch.nt1 [PRE1], INCX16 | |||
(p16) LDFD f32 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p19) FADD f8 = f8, f71 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f35 = [X], INCX | |||
} | |||
{ .mfb | |||
(p19) FADD f9 = f9, f74 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f38 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p19) FADD f10 = f10, f77 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f41 = [X], INCX | |||
} | |||
{ .mfb | |||
(p19) FADD f11 = f11, f80 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f44 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f8 = f8, f34 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f47 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f9 = f9, f37 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f50 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f10 = f10, f40 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f53 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f11 = f11, f43 | |||
} | |||
;; | |||
{ .mmf | |||
#ifdef XDOUBLE | |||
(p16) lfetch.nt1 [PRE1], INCX16 | |||
#endif | |||
(p16) LDFD f56 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f8 = f8, f46 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f59 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f9 = f9, f49 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f62 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f10 = f10, f52 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f65 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f11 = f11, f55 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f68 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f8 = f8, f58 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f71 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f9 = f9, f61 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f74 = [X], STRIDE | |||
} | |||
{ .mfb | |||
(p18) FADD f10 = f10, f64 | |||
} | |||
;; | |||
{ .mmf | |||
(p16) LDFD f77 = [X], INCX | |||
} | |||
{ .mfb | |||
(p18) FADD f11 = f11, f67 | |||
br.ctop.sptk.few .L52 | |||
} | |||
;; | |||
/* Executed only when the loop falls through (the early branch to .L55 skips
   it): adds the same four registers the p19 stage reads. NOTE(review):
   presumably these are the last pass's four values that the three-stage
   epilogue (ar.ec = 3) leaves unadded -- confirm against the IA-64 rotation
   rules before touching. */
FADD f8 = f8, f71 | |||
FADD f9 = f9, f74 | |||
FADD f10 = f10, f77 | |||
FADD f11 = f11, f80 | |||
.align 32 | |||
;; | |||
/* Tail: leftover values selected by the low bits of N.
     p12 -> 8 values (f32..f39), p13 -> 4 values (f40..f43),
     p14 -> 2 values (f44..f45), p15 -> 1 value (f46, real build only).
   p9 (no leftovers at all) skips directly to the final reduction. */
.L55: | |||
(p12) LDFD f32 = [X], STRIDE | |||
(p9) br.cond.dptk .L998 | |||
;; | |||
(p12) LDFD f33 = [X], INCX | |||
;; | |||
(p12) LDFD f34 = [X], STRIDE | |||
;; | |||
(p12) LDFD f35 = [X], INCX | |||
tbit.z p0, p13 = N, (2 - COMPADD) | |||
;; | |||
(p12) LDFD f36 = [X], STRIDE | |||
tbit.z p0, p14 = N, (1 - COMPADD) | |||
;; | |||
(p12) LDFD f37 = [X], INCX | |||
#ifndef COMPLEX | |||
tbit.z p0, p15 = N, 0 | |||
#endif | |||
;; | |||
(p12) LDFD f38 = [X], STRIDE | |||
;; | |||
(p12) LDFD f39 = [X], INCX | |||
;; | |||
(p13) LDFD f40 = [X], STRIDE | |||
;; | |||
(p13) LDFD f41 = [X], INCX | |||
;; | |||
(p13) LDFD f42 = [X], STRIDE | |||
(p12) FADD f8 = f8, f32 | |||
;; | |||
(p13) LDFD f43 = [X], INCX | |||
(p12) FADD f9 = f9, f33 | |||
;; | |||
(p14) LDFD f44 = [X], STRIDE | |||
(p12) FADD f10 = f10, f34 | |||
;; | |||
(p14) LDFD f45 = [X], INCX | |||
(p12) FADD f11 = f11, f35 | |||
;; | |||
#ifndef COMPLEX | |||
(p15) LDFD f46 = [X] | |||
#endif | |||
(p12) FADD f8 = f8, f36 | |||
;; | |||
(p12) FADD f9 = f9, f37 | |||
(p12) FADD f10 = f10, f38 | |||
(p12) FADD f11 = f11, f39 | |||
;; | |||
(p13) FADD f8 = f8, f40 | |||
(p13) FADD f9 = f9, f41 | |||
#ifndef COMPLEX | |||
#endif | |||
(p13) FADD f10 = f10, f42 | |||
;; | |||
(p13) FADD f11 = f11, f43 | |||
(p14) FADD f8 = f8, f44 | |||
(p14) FADD f9 = f9, f45 | |||
#ifndef COMPLEX | |||
(p15) FADD f10 = f10, f46 | |||
#endif | |||
;; | |||
.align 32 | |||
/* Final reduction: f8 = (f8 + f9) + (f10 + f11); restore ar.lc and the
   predicate registers saved on entry, then return. */
.L998: | |||
{ .mfi | |||
FADD f8 = f8, f9 | |||
mov ar.lc = ARLC | |||
} | |||
{ .mmf | |||
FADD f10 = f10, f11 | |||
} | |||
;; | |||
{ .mii | |||
mov pr = PR, -65474 | |||
} | |||
;; | |||
{ .mfb | |||
FADD f8 = f8, f10 | |||
br.ret.sptk.many b0 | |||
} | |||
EPILOGUE |
@@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c | |||
ISMINKERNEL = ../mips/imin.c | |||
IDMINKERNEL = ../mips/imin.c | |||
# Sum kernels: the S and D variants build from sum.c, the C and Z variants from zsum.c.
SSUMKERNEL = ../mips/sum.c | |||
DSUMKERNEL = ../mips/sum.c | |||
CSUMKERNEL = ../mips/zsum.c | |||
ZSUMKERNEL = ../mips/zsum.c | |||
ifdef HAVE_MSA | |||
SASUMKERNEL = ../mips/sasum_msa.c | |||
DASUMKERNEL = ../mips/dasum_msa.c | |||
@@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
while(i < n) | |||
{ | |||
if( x[ix] > minf ) | |||
if( x[ix] < minf ) | |||
{ | |||
min = i; | |||
minf = x[ix]; | |||
@@ -0,0 +1,47 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
/*
 * Plain (signed) sum of a strided vector.
 *
 * n      number of elements to add
 * x      base address of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns x[0] + x[inc_x] + ... over n entries, accumulated left to right.
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG limit;
    BLASLONG pos;

    if (n <= 0 || inc_x <= 0)
        return total;

    /* One past the last index that will be visited. */
    limit = n * inc_x;

    for (pos = 0; pos < limit; pos += inc_x)
        total += x[pos];

    return total;
}
@@ -0,0 +1,52 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <math.h> | |||
/*
 * Plain (signed) sum over a strided vector whose elements are pairs of
 * FLOATs stored back to back.
 *
 * n      number of two-FLOAT elements to add
 * x      base address of the vector
 * inc_x  distance, in two-FLOAT elements, between consecutive entries
 *
 * For every visited element the two FLOATs are first added to each other and
 * that pair sum is then added to the running total, left to right.
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG step;
    BLASLONG limit;
    BLASLONG pos;

    if (n <= 0 || inc_x <= 0)
        return total;

    /* Each element occupies two FLOAT slots. */
    step = 2 * inc_x;
    limit = n * step;

    for (pos = 0; pos < limit; pos += step)
        total += x[pos] + x[pos + 1];

    return total;
}
@@ -0,0 +1,332 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Integer registers: N element count, X current element address, INCX stride
   (scaled to bytes on entry), I loop counter, TEMP scratch used to compare
   the stride against one element. */
#define N $4 | |||
#define X $5 | |||
#define INCX $6 | |||
#define I $2 | |||
#define TEMP $3 | |||
/* a1..a8: the eight values loaded per unrolled pass. */
#define a1 $f2 | |||
#define a2 $f3 | |||
#define a3 $f4 | |||
#define a4 $f5 | |||
#define a5 $f6 | |||
#define a6 $f7 | |||
#define a7 $f8 | |||
#define a8 $f9 | |||
/* t1..t4: staging copies of the loaded values, consumed by the adds. */
#define t1 $f10 | |||
#define t2 $f11 | |||
#define t3 $f12 | |||
#define t4 $f13 | |||
/* s1, s2: the two running sums; s1 ($f0) carries the final result. */
#define s1 $f0 | |||
#define s2 $f1 | |||
/* Sum kernel entry point.
   Inputs : N element count, X data address, INCX stride in elements (read
            through pointers when F_INTERFACE is defined).
   Result : s1 ($f0) = sum of the N selected elements, built from two
            accumulators s1/s2 that are combined at .L999.
   A stride of exactly one element uses the contiguous path (.L12..L16);
   every other stride uses the strided path (.L20..L26). Both process eight
   elements per unrolled pass and then a one-at-a-time tail.
   Only N is range-checked; a non-positive INCX is not rejected here.
   NOTE(review): the instruction following each branch is written as if it
   executes in the branch delay slot (for example the final ADD after
   "j $31") -- presumably PROLOGUE/common.h selects noreorder mode; confirm. */
PROLOGUE | |||
#ifdef F_INTERFACE | |||
LDINT N, 0(N) | |||
LDINT INCX, 0(INCX) | |||
#endif | |||
/* Clear both accumulators and scale the stride (presumably to bytes, via
   BASE_SHIFT from common.h). */
MTC $0, s1 | |||
MTC $0, s2 | |||
dsll INCX, INCX, BASE_SHIFT | |||
blez N, .L999 | |||
li TEMP, SIZE | |||
/* Stride differs from one element -> strided path. I = N / 8 is computed
   in the slot after the branch and is used by both paths. */
bne INCX, TEMP, .L20 | |||
dsra I, N, 3 | |||
blez I, .L15 | |||
NOP | |||
/* Contiguous path: preload the first eight elements and stage the first
   four into t1..t4. */
LD a1, 0 * SIZE(X) | |||
LD a2, 1 * SIZE(X) | |||
LD a3, 2 * SIZE(X) | |||
LD a4, 3 * SIZE(X) | |||
LD a5, 4 * SIZE(X) | |||
MOV t1, a1 | |||
LD a6, 5 * SIZE(X) | |||
MOV t2, a2 | |||
LD a7, 6 * SIZE(X) | |||
MOV t3, a3 | |||
MOV t4, a4 | |||
daddiu I, I, -1 | |||
blez I, .L13 | |||
LD a8, 7 * SIZE(X) | |||
.align 3 | |||
/* Steady state: add the eight staged values of the previous group while
   loading the next eight. */
.L12: | |||
ADD s1, s1, t1 | |||
LD a1, 8 * SIZE(X) | |||
MOV t1, a5 | |||
daddiu I, I, -1 | |||
ADD s2, s2, t2 | |||
LD a2, 9 * SIZE(X) | |||
MOV t2, a6 | |||
NOP | |||
ADD s1, s1, t3 | |||
LD a3, 10 * SIZE(X) | |||
MOV t3, a7 | |||
NOP | |||
ADD s2, s2, t4 | |||
LD a4, 11 * SIZE(X) | |||
MOV t4, a8 | |||
daddiu X, X, 8 * SIZE | |||
ADD s1, s1, t1 | |||
LD a5, 4 * SIZE(X) | |||
MOV t1, a1 | |||
NOP | |||
ADD s2, s2, t2 | |||
LD a6, 5 * SIZE(X) | |||
MOV t2, a2 | |||
NOP | |||
ADD s1, s1, t3 | |||
LD a7, 6 * SIZE(X) | |||
MOV t3, a3 | |||
NOP | |||
ADD s2, s2, t4 | |||
LD a8, 7 * SIZE(X) | |||
bgtz I, .L12 | |||
MOV t4, a4 | |||
.align 3 | |||
/* Drain: add the last preloaded group of eight without loading more. */
.L13: | |||
ADD s1, s1, t1 | |||
daddiu X, X, 8 * SIZE | |||
MOV t1, a5 | |||
NOP | |||
ADD s2, s2, t2 | |||
MOV t2, a6 | |||
ADD s1, s1, t3 | |||
MOV t3, a7 | |||
ADD s2, s2, t4 | |||
MOV t4, a8 | |||
ADD s1, s1, t1 | |||
ADD s2, s2, t2 | |||
ADD s1, s1, t3 | |||
ADD s2, s2, t4 | |||
.align 3 | |||
/* Contiguous tail: the remaining N % 8 elements, one per pass. */
.L15: | |||
andi I, N, 7 | |||
blez I, .L999 | |||
NOP | |||
.align 3 | |||
.L16: | |||
LD a1, 0 * SIZE(X) | |||
daddiu I, I, -1 | |||
MOV t1, a1 | |||
ADD s1, s1, t1 | |||
bgtz I, .L16 | |||
daddiu X, X, SIZE | |||
j .L999 | |||
NOP | |||
.align 3 | |||
/* Strided path: same scheme as above, but X advances by INCX after every
   load. */
.L20: | |||
blez I, .L25 | |||
NOP | |||
LD a1, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a2, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a3, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a4, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a5, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a6, 0 * SIZE(X) | |||
daddu X, X, INCX | |||
MOV t1, a1 | |||
LD a7, 0 * SIZE(X) | |||
MOV t2, a2 | |||
daddu X, X, INCX | |||
MOV t3, a3 | |||
LD a8, 0 * SIZE(X) | |||
MOV t4, a4 | |||
daddiu I, I, -1 | |||
blez I, .L24 | |||
daddu X, X, INCX | |||
.align 3 | |||
.L23: | |||
ADD s1, s1, t1 | |||
LD a1, 0 * SIZE(X) | |||
MOV t1, a5 | |||
daddu X, X, INCX | |||
ADD s2, s2, t2 | |||
LD a2, 0 * SIZE(X) | |||
MOV t2, a6 | |||
daddu X, X, INCX | |||
ADD s1, s1, t3 | |||
LD a3, 0 * SIZE(X) | |||
MOV t3, a7 | |||
daddu X, X, INCX | |||
ADD s2, s2, t4 | |||
LD a4, 0 * SIZE(X) | |||
MOV t4, a8 | |||
daddu X, X, INCX | |||
ADD s1, s1, t1 | |||
LD a5, 0 * SIZE(X) | |||
MOV t1, a1 | |||
daddu X, X, INCX | |||
ADD s2, s2, t2 | |||
LD a6, 0 * SIZE(X) | |||
MOV t2, a2 | |||
daddu X, X, INCX | |||
ADD s1, s1, t3 | |||
LD a7, 0 * SIZE(X) | |||
MOV t3, a3 | |||
daddu X, X, INCX | |||
ADD s2, s2, t4 | |||
LD a8, 0 * SIZE(X) | |||
MOV t4, a4 | |||
daddiu I, I, -1 | |||
bgtz I, .L23 | |||
daddu X, X, INCX | |||
.align 3 | |||
/* Drain the last preloaded group of eight. */
.L24: | |||
ADD s1, s1, t1 | |||
MOV t1, a5 | |||
ADD s2, s2, t2 | |||
MOV t2, a6 | |||
ADD s1, s1, t3 | |||
MOV t3, a7 | |||
ADD s2, s2, t4 | |||
MOV t4, a8 | |||
ADD s1, s1, t1 | |||
ADD s2, s2, t2 | |||
ADD s1, s1, t3 | |||
ADD s2, s2, t4 | |||
.align 3 | |||
/* Strided tail: the remaining N % 8 elements, one per pass. */
.L25: | |||
andi I, N, 7 | |||
blez I, .L999 | |||
NOP | |||
.align 3 | |||
.L26: | |||
LD a1, 0 * SIZE(X) | |||
daddiu I, I, -1 | |||
MOV t1, a1 | |||
daddu X, X, INCX | |||
bgtz I, .L26 | |||
ADD s1, s1, t1 | |||
.align 3 | |||
/* Return; the two accumulators are combined into s1 on the way out. */
.L999: | |||
j $31 | |||
ADD s1, s1, s2 | |||
EPILOGUE |
@@ -0,0 +1,204 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Integer registers: N element count, X current element address, INCX stride
   (scaled on entry with ZBASE_SHIFT), I loop counter. TEMP is defined but
   not used by the function body of this file. */
#define N $4 | |||
#define X $5 | |||
#define INCX $6 | |||
#define I $2 | |||
#define TEMP $3 | |||
/* a1..a8: the eight values (four elements of two values each) loaded per
   unrolled pass. */
#define a1 $f2 | |||
#define a2 $f3 | |||
#define a3 $f4 | |||
#define a4 $f5 | |||
#define a5 $f6 | |||
#define a6 $f7 | |||
#define a7 $f8 | |||
#define a8 $f9 | |||
/* t1..t4: staging copies of the loaded values, consumed by the adds. */
#define t1 $f10 | |||
#define t2 $f11 | |||
#define t3 $f12 | |||
#define t4 $f13 | |||
/* s1, s2: the two running sums; s1 ($f0) carries the final result. */
#define s1 $f0 | |||
#define s2 $f1 | |||
/* Sum kernel entry point for elements made of two adjacent values.
   Inputs : N element count, X data address, INCX stride in elements (read
            through pointers when F_INTERFACE is defined).
   Result : s1 ($f0) = sum of both values of each of the N selected elements.
            The value at offset 0 of each element accumulates into s1, the
            value at offset 1 into s2, and the two are combined at .L999.
   There is a single strided code path: four elements per unrolled pass
   followed by a one-element-at-a-time tail.
   Only N is range-checked; a non-positive INCX is not rejected here.
   NOTE(review): the instruction following each branch is written as if it
   executes in the branch delay slot (for example the final ADD after
   "j $31") -- presumably PROLOGUE/common.h selects noreorder mode; confirm. */
PROLOGUE | |||
#ifdef F_INTERFACE | |||
LDINT N, 0(N) | |||
LDINT INCX, 0(INCX) | |||
#endif | |||
/* Clear both accumulators and scale the stride (presumably to bytes per
   two-value element, via ZBASE_SHIFT from common.h). */
MTC $0, s1 | |||
MTC $0, s2 | |||
dsll INCX, INCX, ZBASE_SHIFT | |||
blez N, .L999 | |||
/* I = N / 4 unrolled passes. */
dsra I, N, 2 | |||
blez I, .L25 | |||
NOP | |||
/* Preload the first four elements and stage the first two into t1..t4. */
LD a1, 0 * SIZE(X) | |||
LD a2, 1 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a3, 0 * SIZE(X) | |||
LD a4, 1 * SIZE(X) | |||
daddu X, X, INCX | |||
LD a5, 0 * SIZE(X) | |||
LD a6, 1 * SIZE(X) | |||
daddu X, X, INCX | |||
MOV t1, a1 | |||
MOV t2, a2 | |||
LD a7, 0 * SIZE(X) | |||
LD a8, 1 * SIZE(X) | |||
MOV t3, a3 | |||
MOV t4, a4 | |||
daddiu I, I, -1 | |||
blez I, .L24 | |||
daddu X, X, INCX | |||
.align 3 | |||
/* Steady state: add the staged values of the previous group of four
   elements while loading the next four. */
.L23: | |||
ADD s1, s1, t1 | |||
LD a1, 0 * SIZE(X) | |||
MOV t1, a5 | |||
daddiu I, I, -1 | |||
ADD s2, s2, t2 | |||
LD a2, 1 * SIZE(X) | |||
MOV t2, a6 | |||
daddu X, X, INCX | |||
ADD s1, s1, t3 | |||
LD a3, 0 * SIZE(X) | |||
MOV t3, a7 | |||
NOP | |||
ADD s2, s2, t4 | |||
LD a4, 1 * SIZE(X) | |||
MOV t4, a8 | |||
daddu X, X, INCX | |||
ADD s1, s1, t1 | |||
LD a5, 0 * SIZE(X) | |||
MOV t1, a1 | |||
NOP | |||
ADD s2, s2, t2 | |||
LD a6, 1 * SIZE(X) | |||
MOV t2, a2 | |||
daddu X, X, INCX | |||
ADD s1, s1, t3 | |||
LD a7, 0 * SIZE(X) | |||
MOV t3, a3 | |||
LD a8, 1 * SIZE(X) | |||
ADD s2, s2, t4 | |||
daddu X, X, INCX | |||
bgtz I, .L23 | |||
MOV t4, a4 | |||
.align 3 | |||
/* Drain the last preloaded group of four elements. */
.L24: | |||
ADD s1, s1, t1 | |||
MOV t1, a5 | |||
ADD s2, s2, t2 | |||
MOV t2, a6 | |||
ADD s1, s1, t3 | |||
MOV t3, a7 | |||
ADD s2, s2, t4 | |||
MOV t4, a8 | |||
ADD s1, s1, t1 | |||
ADD s2, s2, t2 | |||
ADD s1, s1, t3 | |||
ADD s2, s2, t4 | |||
.align 3 | |||
/* Tail: the remaining N % 4 elements, one per pass. */
.L25: | |||
andi I, N, 3 | |||
blez I, .L999 | |||
NOP | |||
.align 3 | |||
.L26: | |||
LD a1, 0 * SIZE(X) | |||
LD a2, 1 * SIZE(X) | |||
MOV t1, a1 | |||
daddiu I, I, -1 | |||
MOV t2, a2 | |||
daddu X, X, INCX | |||
ADD s1, s1, t1 | |||
bgtz I, .L26 | |||
ADD s2, s2, t2 | |||
.align 3 | |||
/* Return; the two accumulators are combined into s1 on the way out. */
.L999: | |||
j $31 | |||
ADD s1, s1, s2 | |||
EPILOGUE |
@@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
# GEMM kernel and copy-routine source selections for the S, D, C and Z
# precisions, together with the object names of the copy routines.
# Every *COPYOBJ name is assigned twice in this span: first with a fixed
# ".o" extension, then built from $(TSUFFIX).$(SUFFIX).  With "=" the later
# assignment is the one that takes effect.
SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
SGEMMINCOPYOBJ = sgemm_incopy.o | |||
SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Double precision real.
DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMINCOPYOBJ = dgemm_incopy.o | |||
DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Single precision complex (the generic copy sources are the zgemm_* files).
CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
CGEMMINCOPYOBJ = cgemm_incopy.o | |||
CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# Double precision complex.
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# Single precision TRSM kernels come from the generic C sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# Level-1 and level-2 kernel source selections.  Lines starting with "#"
# are disabled alternatives.  Where a variable is assigned more than once
# in this span, the later "=" assignment is the one that takes effect.
#SMINKERNEL = ../arm/min.c | |||
#DMINKERNEL = ../arm/min.c | |||
# | |||
# Index-of-maximum-absolute-value kernels (IZAMAXKERNEL is listed twice
# with the same value).
#ISAMAXKERNEL = ../arm/iamax.c | |||
ISAMAXKERNEL = isamax.c | |||
IDAMAXKERNEL = idamax.c | |||
#ICAMAXKERNEL = ../arm/izamax.c | |||
IZAMAXKERNEL = izamax.c | |||
ICAMAXKERNEL = icamax.c | |||
IZAMAXKERNEL = izamax.c | |||
# | |||
# Index-of-minimum-absolute-value kernels (IDAMINKERNEL is listed twice
# with the same value).
#ISAMINKERNEL = ../arm/iamin.c | |||
IDAMINKERNEL = idamin.c | |||
#ICAMINKERNEL = ../arm/izamin.c | |||
ISAMINKERNEL = isamin.c | |||
IDAMINKERNEL = idamin.c | |||
ICAMINKERNEL = icamin.c | |||
IZAMINKERNEL = izamin.c | |||
# | |||
#ISMAXKERNEL = ../arm/imax.c | |||
@@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c | |||
CASUMKERNEL = casum.c | |||
ZASUMKERNEL = zasum.c | |||
# | |||
# AXPY kernels.
#SAXPYKERNEL = ../arm/axpy.c | |||
SAXPYKERNEL = saxpy.c | |||
DAXPYKERNEL = daxpy.c | |||
#CAXPYKERNEL = ../arm/zaxpy.c | |||
CAXPYKERNEL = caxpy.c | |||
ZAXPYKERNEL = zaxpy.c | |||
# | |||
SCOPYKERNEL = scopy.c | |||
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c | |||
# Dot-product kernels; DSDOTKERNEL uses the same source file as SDOTKERNEL.
SDOTKERNEL = sdot.c | |||
DDOTKERNEL = ddot.c | |||
DSDOTKERNEL = sdot.c | |||
#CDOTKERNEL = ../arm/zdot.c | |||
CDOTKERNEL = cdot.c | |||
ZDOTKERNEL = zdot.c | |||
# | |||
SNRM2KERNEL = ../arm/nrm2.c | |||
@@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||
# | |||
# Rotation kernels; CROTKERNEL is listed as zrot.c and then as crot.c, so
# crot.c is the effective value.
SROTKERNEL = srot.c | |||
DROTKERNEL = drot.c | |||
CROTKERNEL = zrot.c | |||
CROTKERNEL = crot.c | |||
ZROTKERNEL = zrot.c | |||
# | |||
SSCALKERNEL = sscal.c | |||
@@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c | |||
ZSWAPKERNEL = zswap.c | |||
# | |||
# GEMV kernels, non-transposed (N) then transposed (T).
#SGEMVNKERNEL = ../arm/gemv_n.c | |||
SGEMVNKERNEL = sgemv_n.c | |||
DGEMVNKERNEL = dgemv_n.c | |||
#CGEMVNKERNEL = ../arm/zgemv_n.c | |||
CGEMVNKERNEL = cgemv_n.c | |||
ZGEMVNKERNEL = zgemv_n_4.c | |||
# | |||
#SGEMVTKERNEL = ../arm/gemv_t.c | |||
SGEMVTKERNEL = sgemv_t.c | |||
DGEMVTKERNEL = dgemv_t.c | |||
#CGEMVTKERNEL = ../arm/zgemv_t.c | |||
CGEMVTKERNEL = cgemv_t.c | |||
ZGEMVTKERNEL = zgemv_t_4.c | |||
@@ -0,0 +1,184 @@ | |||
# Kernel source selection list.  Each variable names the source file used
# for one routine; lines starting with "#" are disabled alternatives.
# In this list only DTRMMKERNEL and DGEMMKERNEL point at a *_power9.S file;
# the other assembly kernels named here are *_power8.S files.
#SGEMM_BETA = ../generic/gemm_beta.c | |||
#DGEMM_BETA = ../generic/gemm_beta.c | |||
#CGEMM_BETA = ../generic/zgemm_beta.c | |||
#ZGEMM_BETA = ../generic/zgemm_beta.c | |||
# TRMM kernels.
STRMMKERNEL = strmm_kernel_16x8_power8.S | |||
DTRMMKERNEL = dgemm_kernel_power9.S | |||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
# GEMM kernels, their copy-routine sources, and the copy-routine object
# names (built from $(TSUFFIX) and $(SUFFIX)).
SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = dgemm_kernel_power9.S | |||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernels: all generic C except DTRSMKERNEL_LT, which names an
# assembly file.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
#Pure C for other kernels | |||
#SAMAXKERNEL = ../arm/amax.c | |||
#DAMAXKERNEL = ../arm/amax.c | |||
#CAMAXKERNEL = ../arm/zamax.c | |||
#ZAMAXKERNEL = ../arm/zamax.c | |||
# | |||
#SAMINKERNEL = ../arm/amin.c | |||
#DAMINKERNEL = ../arm/amin.c | |||
#CAMINKERNEL = ../arm/zamin.c | |||
#ZAMINKERNEL = ../arm/zamin.c | |||
# | |||
#SMAXKERNEL = ../arm/max.c | |||
#DMAXKERNEL = ../arm/max.c | |||
# | |||
#SMINKERNEL = ../arm/min.c | |||
#DMINKERNEL = ../arm/min.c | |||
# | |||
# Level-1 kernels with a source file in this directory, one per precision.
ISAMAXKERNEL = isamax.c | |||
IDAMAXKERNEL = idamax.c | |||
ICAMAXKERNEL = icamax.c | |||
IZAMAXKERNEL = izamax.c | |||
# | |||
ISAMINKERNEL = isamin.c | |||
IDAMINKERNEL = idamin.c | |||
ICAMINKERNEL = icamin.c | |||
IZAMINKERNEL = izamin.c | |||
# | |||
#ISMAXKERNEL = ../arm/imax.c | |||
#IDMAXKERNEL = ../arm/imax.c | |||
# | |||
#ISMINKERNEL = ../arm/imin.c | |||
#IDMINKERNEL = ../arm/imin.c | |||
# | |||
SASUMKERNEL = sasum.c | |||
DASUMKERNEL = dasum.c | |||
CASUMKERNEL = casum.c | |||
ZASUMKERNEL = zasum.c | |||
# | |||
SAXPYKERNEL = saxpy.c | |||
DAXPYKERNEL = daxpy.c | |||
CAXPYKERNEL = caxpy.c | |||
ZAXPYKERNEL = zaxpy.c | |||
# | |||
SCOPYKERNEL = scopy.c | |||
DCOPYKERNEL = dcopy.c | |||
CCOPYKERNEL = ccopy.c | |||
ZCOPYKERNEL = zcopy.c | |||
# | |||
# DSDOTKERNEL uses the same source file as SDOTKERNEL.
SDOTKERNEL = sdot.c | |||
DDOTKERNEL = ddot.c | |||
DSDOTKERNEL = sdot.c | |||
CDOTKERNEL = cdot.c | |||
ZDOTKERNEL = zdot.c | |||
# | |||
# The NRM2 kernels are taken from the ../arm directory.
SNRM2KERNEL = ../arm/nrm2.c | |||
DNRM2KERNEL = ../arm/nrm2.c | |||
CNRM2KERNEL = ../arm/znrm2.c | |||
ZNRM2KERNEL = ../arm/znrm2.c | |||
# | |||
SROTKERNEL = srot.c | |||
DROTKERNEL = drot.c | |||
CROTKERNEL = crot.c | |||
ZROTKERNEL = zrot.c | |||
# | |||
# CSCALKERNEL and ZSCALKERNEL both name zscal.c.
SSCALKERNEL = sscal.c | |||
DSCALKERNEL = dscal.c | |||
CSCALKERNEL = zscal.c | |||
ZSCALKERNEL = zscal.c | |||
# | |||
SSWAPKERNEL = sswap.c | |||
DSWAPKERNEL = dswap.c | |||
CSWAPKERNEL = cswap.c | |||
ZSWAPKERNEL = zswap.c | |||
# | |||
# Level-2 GEMV kernels, non-transposed (N) then transposed (T).
SGEMVNKERNEL = sgemv_n.c | |||
DGEMVNKERNEL = dgemv_n.c | |||
CGEMVNKERNEL = cgemv_n.c | |||
ZGEMVNKERNEL = zgemv_n_4.c | |||
# | |||
SGEMVTKERNEL = sgemv_t.c | |||
DGEMVTKERNEL = dgemv_t.c | |||
CGEMVTKERNEL = cgemv_t.c | |||
ZGEMVTKERNEL = zgemv_t_4.c | |||
#SSYMV_U_KERNEL = ../generic/symv_k.c | |||
#SSYMV_L_KERNEL = ../generic/symv_k.c | |||
#DSYMV_U_KERNEL = ../generic/symv_k.c | |||
#DSYMV_L_KERNEL = ../generic/symv_k.c | |||
#QSYMV_U_KERNEL = ../generic/symv_k.c | |||
#QSYMV_L_KERNEL = ../generic/symv_k.c | |||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
# Helper routines taken from the generic C sources.
LSAME_KERNEL = ../generic/lsame.c | |||
SCABS_KERNEL = ../generic/cabs.c | |||
DCABS_KERNEL = ../generic/cabs.c | |||
QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c |
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "casum_microk_power8.c" | |||
#endif | |||
@@ -0,0 +1,145 @@ | |||
/* | |||
Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#ifndef HAVE_ASM_KERNEL | |||
#include <altivec.h> | |||
/*
 * Vector kernel for the contiguous case: adds alpha * x to y for n complex
 * elements stored as interleaved (re, im) floats.
 *
 *   n                 number of complex elements; 16 are consumed per loop
 *                     iteration (8 vectors of 4 floats), and the caller in
 *                     this file passes n & -16
 *   x, y              source and destination, accessed as 4-float vectors
 *   alpha_r, alpha_i  real and imaginary parts of the scalar
 *
 * Per element the first preprocessor branch produces
 *   re += x_re*alpha_r - x_im*alpha_i,  im += x_im*alpha_r + x_re*alpha_i
 * and the #else branch produces
 *   re += x_re*alpha_r + x_im*alpha_i,  im += -x_im*alpha_r + x_re*alpha_i.
 *
 * NOTE(review): x and y are dereferenced through __vector float pointers;
 * presumably the generated loads/stores tolerate whatever alignment the
 * caller's arrays have - confirm.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) | |||
{ | |||
/* The sign pattern of the broadcast alpha vectors encodes the +/- of the
   complex multiply so the loop body needs only multiply-adds. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; | |||
register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; | |||
#else | |||
register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; | |||
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; | |||
#endif | |||
/* Permutation that exchanges the re and im float of every complex pair. */
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vx = (__vector float *) x; | |||
BLASLONG i=0; | |||
/* n complex elements occupy n/2 vectors; i counts vectors. */
for (; i < n/2; i += 8) { | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float vy_2 = vy[i + 2]; | |||
register __vector float vy_3 = vy[i + 3]; | |||
register __vector float vy_4 = vy[i + 4]; | |||
register __vector float vy_5 = vy[i + 5]; | |||
register __vector float vy_6 = vy[i + 6]; | |||
register __vector float vy_7 = vy[i + 7]; | |||
register __vector float vx_0 = vx[i]; | |||
register __vector float vx_1 = vx[i + 1]; | |||
register __vector float vx_2 = vx[i + 2]; | |||
register __vector float vx_3 = vx[i + 3]; | |||
register __vector float vx_4 = vx[i + 4]; | |||
register __vector float vx_5 = vx[i + 5]; | |||
register __vector float vx_6 = vx[i + 6]; | |||
register __vector float vx_7 = vx[i + 7]; | |||
/* First half of the product: x in its original lane order times alpha_r. */
vy_0 += vx_0*valpha_r; | |||
vy_1 += vx_1*valpha_r; | |||
vy_2 += vx_2*valpha_r; | |||
vy_3 += vx_3*valpha_r; | |||
vy_4 += vx_4*valpha_r; | |||
vy_5 += vx_5*valpha_r; | |||
vy_6 += vx_6*valpha_r; | |||
vy_7 += vx_7*valpha_r; | |||
/* Swap re/im within each pair of x for the cross terms. */
vx_0 = vec_perm(vx_0, vx_0, swap_mask); | |||
vx_1 = vec_perm(vx_1, vx_1, swap_mask); | |||
vx_2 = vec_perm(vx_2, vx_2, swap_mask); | |||
vx_3 = vec_perm(vx_3, vx_3, swap_mask); | |||
vx_4 = vec_perm(vx_4, vx_4, swap_mask); | |||
vx_5 = vec_perm(vx_5, vx_5, swap_mask); | |||
vx_6 = vec_perm(vx_6, vx_6, swap_mask); | |||
vx_7 = vec_perm(vx_7, vx_7, swap_mask); | |||
/* Second half of the product: swapped x times alpha_i. */
vy_0 += vx_0*valpha_i; | |||
vy_1 += vx_1*valpha_i; | |||
vy_2 += vx_2*valpha_i; | |||
vy_3 += vx_3*valpha_i; | |||
vy_4 += vx_4*valpha_i; | |||
vy_5 += vx_5*valpha_i; | |||
vy_6 += vx_6*valpha_i; | |||
vy_7 += vx_7*valpha_i; | |||
/* Write the eight updated vectors back to y. */
vy[i] = vy_0; | |||
vy[i + 1] = vy_1; | |||
vy[i + 2] = vy_2; | |||
vy[i + 3] = vy_3; | |||
vy[i + 4] = vy_4; | |||
vy[i + 5] = vy_5 ; | |||
vy[i + 6] = vy_6 ; | |||
vy[i + 7] = vy_7 ; | |||
} | |||
} | |||
#endif | |||
/*
 * AXPY driver for complex vectors stored as interleaved (re, im) FLOAT
 * pairs: y := y + alpha * x, or y := y + alpha * conj(x) when CONJ is
 * defined.
 *
 *   n             number of complex elements; n <= 0 is a no-op
 *   da_r, da_i    real and imaginary parts of alpha
 *   x, inc_x      source vector and its stride in complex elements
 *   y, inc_y      destination vector and its stride in complex elements
 *   dummy0, dummy1, dummy, dummy2   unused
 *
 * When both strides are 1, the largest multiple of 16 elements is handed to
 * caxpy_kernel_16 and the remainder is finished with scalar code; any other
 * stride combination is handled entirely by the scalar loop.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    if (n <= 0)
        return (0);

    if (inc_x != 1 || inc_y != 1) {
        /* General strides: convert to FLOAT offsets and walk element by element. */
        const BLASLONG xstep = inc_x * 2;
        const BLASLONG ystep = inc_y * 2;

        for (k = 0; k < n; k++) {
#if !defined(CONJ)
            y[ypos] += (da_r * x[xpos] - da_i * x[xpos + 1]);
            y[ypos + 1] += (da_r * x[xpos + 1] + da_i * x[xpos]);
#else
            y[ypos] += (da_r * x[xpos] + da_i * x[xpos + 1]);
            y[ypos + 1] -= (da_r * x[xpos + 1] - da_i * x[xpos]);
#endif
            xpos += xstep;
            ypos += ystep;
        }
        return (0);
    }

    /* Contiguous vectors: bulk part through the vector kernel, tail in scalar code. */
    {
        const BLASLONG bulk = n & -16;

        if (bulk)
            caxpy_kernel_16(bulk, x, y, da_r, da_i);

        /* x and y share one running offset here because both strides are 1. */
        xpos = 2 * bulk;
        for (k = bulk; k < n; k++) {
#if !defined(CONJ)
            y[xpos] += (da_r * x[xpos] - da_i * x[xpos + 1]);
            y[xpos + 1] += (da_r * x[xpos + 1] + da_i * x[xpos]);
#else
            y[xpos] += (da_r * x[xpos] + da_i * x[xpos + 1]);
            y[xpos + 1] -= (da_r * x[xpos + 1] - da_i * x[xpos]);
#endif
            xpos += 2;
        }
    }
    return (0);
}
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "ccopy_microk_power8.c" | |||
#endif | |||
@@ -0,0 +1,164 @@ | |||
/*Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#ifndef HAVE_KERNEL_8 | |||
#include <altivec.h> | |||
/*
 * Vector kernel producing the four partial sums of a complex dot product
 * over n contiguous complex elements stored as interleaved (re, im) floats.
 *
 *   n     number of complex elements; 8 are consumed per loop iteration
 *         (4 vectors of 4 floats), and the caller in this file passes n & -8
 *   x, y  input vectors, accessed as 4-float vectors
 *   dot   output, overwritten with
 *           dot[0] = sum of x_re * y_re
 *           dot[1] = sum of x_im * y_im
 *           dot[2] = sum of x_re * y_im
 *           dot[3] = sum of x_im * y_re
 *
 * The caller combines these four sums with the signs required for the
 * conjugated or unconjugated result.
 */
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||
{ | |||
/* Permutation that exchanges the re and im float of every complex pair. */
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vx = (__vector float *) x; | |||
BLASLONG i = 0; | |||
/* vd_* accumulate x * y lane by lane; vdd_* accumulate x * swapped(y). */
register __vector float vd_0 = { 0 }; | |||
register __vector float vd_1 = { 0 }; | |||
register __vector float vd_2 = { 0 }; | |||
register __vector float vd_3 = { 0 }; | |||
register __vector float vdd_0 = { 0 }; | |||
register __vector float vdd_1 = { 0 }; | |||
register __vector float vdd_2 = { 0 }; | |||
register __vector float vdd_3 = { 0 }; | |||
/* n complex elements occupy n/2 vectors; i counts vectors. */
for (; i < n/2; i += 4) { | |||
register __vector float vyy_0 ; | |||
register __vector float vyy_1 ; | |||
register __vector float vyy_2 ; | |||
register __vector float vyy_3 ; | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float vy_2 = vy[i + 2]; | |||
register __vector float vy_3 = vy[i + 3]; | |||
register __vector float vx_0= vx[i]; | |||
register __vector float vx_1 = vx[i + 1]; | |||
register __vector float vx_2 = vx[i + 2]; | |||
register __vector float vx_3 = vx[i + 3]; | |||
vyy_0 = vec_perm(vy_0, vy_0, swap_mask); | |||
vyy_1 = vec_perm(vy_1, vy_1, swap_mask); | |||
vyy_2 = vec_perm(vy_2, vy_2, swap_mask); | |||
vyy_3 = vec_perm(vy_3, vy_3, swap_mask); | |||
vd_0 += vx_0 * vy_0; | |||
vd_1 += vx_1 * vy_1; | |||
vd_2 += vx_2 * vy_2; | |||
vd_3 += vx_3 * vy_3; | |||
vdd_0 += vx_0 * vyy_0; | |||
vdd_1 += vx_1 * vyy_1; | |||
vdd_2 += vx_2 * vyy_2; | |||
vdd_3 += vx_3 * vyy_3; | |||
} | |||
//aggregate | |||
vd_0 = vd_0 + vd_1 +vd_2 +vd_3; | |||
vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; | |||
//reverse and aggregate | |||
/* Each accumulator still holds two complex pairs; adding the vector to a
   copy of itself with the two 64-bit halves exchanged leaves the full sums
   in lanes 0 and 1. */
vd_1=vec_xxpermdi(vd_0,vd_0,2) ; | |||
vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); | |||
vd_2=vd_0+vd_1; | |||
vdd_2=vdd_0+vdd_1; | |||
dot[0]=vd_2[0]; | |||
dot[1]=vd_2[1]; | |||
dot[2]=vdd_2[0]; | |||
dot[3]=vdd_2[1]; | |||
} | |||
#endif | |||
/*
 * Complex dot-product driver for vectors stored as interleaved (re, im)
 * FLOAT pairs.
 *
 *   n         number of complex elements; for n <= 0 the result is 0 + 0i
 *   x, inc_x  first vector and its stride in complex elements
 *   y, inc_y  second vector and its stride in complex elements
 *
 * Four partial sums are accumulated:
 *   acc[0] = sum x_re*y_re   acc[1] = sum x_im*y_im
 *   acc[2] = sum x_re*y_im   acc[3] = sum x_im*y_re
 * Without CONJ the result is (acc[0] - acc[1]) + i (acc[2] + acc[3]);
 * with CONJ it is (acc[0] + acc[1]) + i (acc[2] - acc[3]).
 *
 * When both strides are 1, the largest multiple of 8 elements is handed to
 * cdot_kernel_8 and the remainder is finished with scalar code.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT acc[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
    BLASLONG k;

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return (result);
    }

    if (inc_x == 1 && inc_y == 1) {
        /* Contiguous vectors: bulk part through the vector kernel, tail in scalar code. */
        const BLASLONG bulk = n & -8;
        BLASLONG pos = 0;

        if (bulk) {
            cdot_kernel_8(bulk, x, y, acc);
            pos = bulk << 1;
        }

        for (k = bulk; k < n; k++) {
            acc[0] += x[pos] * y[pos];
            acc[1] += x[pos + 1] * y[pos + 1];
            acc[2] += x[pos] * y[pos + 1];
            acc[3] += x[pos + 1] * y[pos];
            pos += 2;
        }
    } else {
        /* General strides: convert to FLOAT offsets and walk element by element. */
        const BLASLONG xstep = inc_x << 1;
        const BLASLONG ystep = inc_y << 1;
        BLASLONG xpos = 0;
        BLASLONG ypos = 0;

        for (k = 0; k < n; k++) {
            acc[0] += x[xpos] * y[ypos];
            acc[1] += x[xpos + 1] * y[ypos + 1];
            acc[2] += x[xpos] * y[ypos + 1];
            acc[3] += x[xpos + 1] * y[ypos];
            xpos += xstep;
            ypos += ystep;
        }
    }

#if !defined(CONJ)
    CREAL(result) = acc[0] - acc[1];
    CIMAG(result) = acc[2] + acc[3];
#else
    CREAL(result) = acc[0] + acc[1];
    CIMAG(result) = acc[2] - acc[3];
#endif
    return (result);
}
@@ -0,0 +1,585 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <altivec.h> | |||
/* Row-block size: the driver in this file processes at most NBMAX rows of
   the matrix per pass, accumulating into a scratch buffer of that length. */
#define NBMAX 1024 | |||
/* Byte indices for vec_perm that exchange the two 4-byte halves of every
   8-byte pair, i.e. swap the re and im float of each complex element. */
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
/*
 * Accumulates the contribution of four matrix columns into y:
 * for each of n rows, y[row] += sum over the 4 columns of a[row] (*) x[col],
 * where (*) is the complex product form selected by CONJ/XCONJ.
 *
 *   n    number of complex rows; 4 rows (2 vectors) are handled per iteration
 *   lda  distance in FLOATs between consecutive columns (a1 = ap + lda, ...)
 *   ap   first of the four columns, interleaved (re, im) floats
 *   x    four complex multipliers, read as x[0..7]
 *   y    accumulator, interleaved (re, im) floats, updated in place
 *
 * Per column the first preprocessor branch yields
 *   re += a_re*x_re - a_im*x_im,  im += a_im*x_re + a_re*x_im
 * and the #else branch yields
 *   re += a_re*x_re + a_im*x_im,  im += -a_im*x_re + a_re*x_im.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||
FLOAT *a0, *a1, *a2, *a3; | |||
a0 = ap; | |||
a1 = ap + lda; | |||
a2 = a1 + lda; | |||
a3 = a2 + lda; | |||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||
/* Broadcast each multiplier with the sign pattern of the chosen product
   form baked in, so the loop needs only multiplies and adds. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; | |||
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; | |||
register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; | |||
register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; | |||
register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; | |||
register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; | |||
register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; | |||
register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; | |||
#else | |||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; | |||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; | |||
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; | |||
register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; | |||
register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; | |||
register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; | |||
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; | |||
register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; | |||
#endif | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vptr_a0 = (__vector float *) a0; | |||
register __vector float *vptr_a1 = (__vector float *) a1; | |||
register __vector float *vptr_a2 = (__vector float *) a2; | |||
register __vector float *vptr_a3 = (__vector float *) a3; | |||
BLASLONG i = 0; | |||
/* n complex rows occupy n/2 vectors; i counts vectors. */
for (;i< n / 2; i+=2) { | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float va0 = vptr_a0[i]; | |||
register __vector float va1 = vptr_a1[i]; | |||
register __vector float va2 = vptr_a2[i]; | |||
register __vector float va3 = vptr_a3[i]; | |||
register __vector float va0_1 = vptr_a0[i + 1]; | |||
register __vector float va1_1 = vptr_a1[i + 1]; | |||
register __vector float va2_1 = vptr_a2[i + 1]; | |||
register __vector float va3_1 = vptr_a3[i + 1]; | |||
/* Terms that pair each matrix lane with the *_r multiplier. */
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; | |||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; | |||
/* Swap re/im of the matrix entries for the cross terms. */
va0 = vec_perm(va0, va0,swap_mask); | |||
va0_1 = vec_perm(va0_1, va0_1,swap_mask); | |||
va1 = vec_perm(va1, va1,swap_mask); | |||
va1_1 = vec_perm(va1_1, va1_1,swap_mask); | |||
va2 = vec_perm(va2, va2,swap_mask); | |||
va2_1 = vec_perm(va2_1, va2_1,swap_mask); | |||
va3 = vec_perm(va3, va3,swap_mask); | |||
va3_1 = vec_perm(va3_1, va3_1,swap_mask); | |||
/* Cross terms with the *_i multiplier. */
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; | |||
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; | |||
vy[i] = vy_0; | |||
vy[i + 1] = vy_1; | |||
} | |||
} | |||
/*
 * Two-column variant of the column-accumulation kernel:
 * for each of n rows, y[row] += a0[row] (*) x[0] + a1[row] (*) x[1], where
 * (*) is the complex product form selected by CONJ/XCONJ.
 *
 *   n    number of complex rows; 4 rows (2 vectors) are handled per iteration
 *   lda  distance in FLOATs between the two columns (a1 = ap + lda)
 *   ap   first column, interleaved (re, im) floats
 *   x    two complex multipliers, read as x[0..3]
 *   y    accumulator, interleaved (re, im) floats, updated in place
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||
FLOAT *a0, *a1; | |||
a0 = ap; | |||
a1 = ap + lda; | |||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||
/* Multipliers broadcast with the sign pattern of the chosen product form. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; | |||
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; | |||
register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; | |||
register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; | |||
#else | |||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; | |||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; | |||
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; | |||
register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; | |||
#endif | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vptr_a0 = (__vector float *) a0; | |||
register __vector float *vptr_a1 = (__vector float *) a1; | |||
BLASLONG i = 0; | |||
/* n complex rows occupy n/2 vectors; i counts vectors. */
for (;i< n / 2; i+=2) { | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float va0 = vptr_a0[i]; | |||
register __vector float va1 = vptr_a1[i]; | |||
register __vector float va0_1 = vptr_a0[i + 1]; | |||
register __vector float va1_1 = vptr_a1[i + 1]; | |||
/* re/im-swapped copies of the matrix entries supply the cross terms. */
register __vector float va0x = vec_perm(va0, va0,swap_mask); | |||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | |||
register __vector float va1x = vec_perm(va1, va1,swap_mask); | |||
register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); | |||
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; | |||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; | |||
vy[i] = vy_0; | |||
vy[i + 1] = vy_1; | |||
} | |||
} | |||
/*
 * Single-column variant of the column-accumulation kernel:
 * for each of n rows, y[row] += ap[row] (*) x[0], where (*) is the complex
 * product form selected by CONJ/XCONJ.
 *
 *   n   number of complex rows; 4 rows (2 vectors) are handled per iteration
 *   ap  the column, interleaved (re, im) floats
 *   x   one complex multiplier, read as x[0] and x[1]
 *   y   accumulator, interleaved (re, im) floats, updated in place
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { | |||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||
/* Multiplier broadcast with the sign pattern of the chosen product form. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; | |||
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; | |||
#else | |||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; | |||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; | |||
#endif | |||
register __vector float *vy = (__vector float *) y; | |||
register __vector float *vptr_a0 = (__vector float *) ap; | |||
BLASLONG i = 0; | |||
/* n complex rows occupy n/2 vectors; i counts vectors. */
for (;i< n / 2; i+=2) { | |||
register __vector float vy_0 = vy[i]; | |||
register __vector float vy_1 = vy[i + 1]; | |||
register __vector float va0 = vptr_a0[i]; | |||
register __vector float va0_1 = vptr_a0[i + 1]; | |||
/* re/im-swapped copies of the matrix entries supply the cross terms. */
register __vector float va0x = vec_perm(va0, va0,swap_mask); | |||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); | |||
vy_0 += va0*vx0_r + va0x*vx0_i; | |||
vy_1 += va0_1*vx0_r + va0x_1*vx0_i; | |||
vy[i] = vy_0; | |||
vy[i + 1] = vy_1; | |||
} | |||
} | |||
/*
 * Scales the accumulated block in src by alpha and adds it into dest:
 * dest += alpha * src, or dest += alpha * conj(src) when XCONJ is defined.
 *
 *   n         number of complex elements in src
 *   src       contiguous interleaved (re, im) floats
 *   dest      destination, interleaved (re, im) floats
 *   inc_dest  destination stride in FLOATs (the caller in this file passes
 *             the already doubled inc_y), so 2 means contiguous
 *   alpha_r, alpha_i  real and imaginary parts of alpha
 *
 * A stride other than 2 uses the scalar loop; a stride of 2 uses the vector
 * loop, which handles 4 complex elements (2 vectors) per iteration.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { | |||
BLASLONG i; | |||
if (inc_dest != 2) { | |||
FLOAT temp_r; | |||
FLOAT temp_i; | |||
for ( i=0; i<n; i++ ) | |||
{ | |||
#if !defined(XCONJ) | |||
temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||
temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||
#else | |||
temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||
temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||
#endif | |||
*dest += temp_r; | |||
*(dest+1) += temp_i; | |||
/* src is always contiguous; only dest honours the caller's stride. */
src+=2; | |||
dest += inc_dest; | |||
} | |||
return; | |||
} else { | |||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); | |||
/* Same arithmetic as the scalar branch, with the signs folded into the
   broadcast alpha vectors. */
#if !defined(XCONJ) | |||
register __vector float valpha_r = {alpha_r, alpha_r, alpha_r, alpha_r}; | |||
register __vector float valpha_i = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||
#else | |||
register __vector float valpha_r = {alpha_r, -alpha_r, alpha_r, -alpha_r}; | |||
register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i}; | |||
#endif | |||
register __vector float *vptr_src = (__vector float *) src; | |||
register __vector float *vptr_y = (__vector float *) dest; | |||
/* n complex elements occupy n/2 vectors; i counts vectors. */
for (i = 0; i < n/2; i += 2 ){ | |||
register __vector float vy_0 = vptr_y[i]; | |||
register __vector float vy_1 = vptr_y[i +1]; | |||
register __vector float vsrc = vptr_src[i]; | |||
register __vector float vsrc_1 = vptr_src[i + 1]; | |||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask); | |||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask); | |||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i; | |||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; | |||
vptr_y[i] = vy_0; | |||
vptr_y[i+1 ] = vy_1; | |||
} | |||
} | |||
return; | |||
} | |||
/*
 * Scalar helpers for the 1-3 leftover rows handled at the end of CNAME.
 *
 * CGEMV_N_TAIL_RE/IM  real / imaginary part of one a*x product; the
 *                     CONJ/XCONJ combination selects the sign pattern.
 * CGEMV_N_TAIL_MAC    adds one product to a (real, imag) accumulator pair.
 * CGEMV_N_TAIL_STORE  adds alpha times an accumulator pair to y; uses the
 *                     alpha_r / alpha_i parameters of the enclosing function.
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_N_TAIL_RE(ap, xp) ((ap)[0] * (xp)[0] - (ap)[1] * (xp)[1])
#define CGEMV_N_TAIL_IM(ap, xp) ((ap)[0] * (xp)[1] + (ap)[1] * (xp)[0])
#else
#define CGEMV_N_TAIL_RE(ap, xp) ((ap)[0] * (xp)[0] + (ap)[1] * (xp)[1])
#define CGEMV_N_TAIL_IM(ap, xp) ((ap)[0] * (xp)[1] - (ap)[1] * (xp)[0])
#endif
#define CGEMV_N_TAIL_MAC(tr, ti, ap, xp) \
    do { (tr) += CGEMV_N_TAIL_RE(ap, xp); (ti) += CGEMV_N_TAIL_IM(ap, xp); } while (0)
#if !defined(XCONJ)
#define CGEMV_N_TAIL_STORE(yp, tr, ti) \
    do { (yp)[0] += alpha_r * (tr) - alpha_i * (ti); \
         (yp)[1] += alpha_r * (ti) + alpha_i * (tr); } while (0)
#else
#define CGEMV_N_TAIL_STORE(yp, tr, ti) \
    do { (yp)[0] += alpha_r * (tr) + alpha_i * (ti); \
         (yp)[1] -= alpha_r * (ti) - alpha_i * (tr); } while (0)
#endif

/*
 * Driver for the non-transposed complex GEMV update of y.
 *
 * m, n          rows / columns of the matrix a
 * dummy1        unused
 * alpha_r/i     complex scale factor
 * a, lda        matrix and its column stride in complex elements
 * x, inc_x      input vector and stride in complex elements
 * y, inc_y      output vector and stride in complex elements
 * buffer        scratch space used as the per-block y accumulator
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 4
 * rows): the block result is accumulated into ybuffer by the
 * cgemv_kernel_4x* helpers (defined earlier in the file - presumably they
 * add A_block * x into ybuffer) and then folded into y by add_y.  The
 * remaining m % 4 rows are handled by the scalar code at the end.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    FLOAT xbuffer[8];
    FLOAT *ybuffer = buffer;
    FLOAT *a_ptr, *x_ptr, *y_ptr;
    BLASLONG i, k;
    BLASLONG n1, n2, m1, m2, m3;
    BLASLONG NB = NBMAX;
    FLOAT sum_r[3] = {0.0, 0.0, 0.0};
    FLOAT sum_i[3] = {0.0, 0.0, 0.0};

    if (m < 1 || n < 1) return (0);

    /* From here on strides are counted in FLOATs, not complex elements. */
    inc_x *= 2;
    inc_y *= 2;
    lda   *= 2;

    n1 = n / 4;              /* groups of four columns         */
    n2 = n % 4;              /* leftover columns               */
    m3 = m % 4;              /* leftover rows (scalar tail)    */
    m1 = m - m3;             /* rows covered by the kernels    */
    m2 = (m % NBMAX) - m3;   /* size of the final partial block */

    y_ptr = y;

    /* NB starts at NBMAX, so the first pass always runs; the pass that
       shrinks NB to m2 is the last one. */
    do {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;
        memset(ybuffer, 0, NB * 2 * sizeof(FLOAT));

        if (inc_x == 2) {
            /* x is contiguous: feed it to the kernels directly. */
            for (i = 0; i < n1; i++) {
                cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
                a_ptr += 4 * lda;
                x_ptr += 8;
            }
            if (n2 & 2) {
                cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }
            if (n2 & 1) {
                cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;
            }
        } else {
            /* Strided x: gather the needed entries into xbuffer first. */
            for (i = 0; i < n1; i++) {
                for (k = 0; k < 4; k++) {
                    xbuffer[2 * k]     = x_ptr[0];
                    xbuffer[2 * k + 1] = x_ptr[1];
                    x_ptr += inc_x;
                }
                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);
                a_ptr += 4 * lda;
            }
            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;
            }
        }

        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a     += 2 * NB;
        y_ptr += NB * inc_y;
    } while (NB == NBMAX);

    if (m3 == 0) return (0);

    /* Scalar tail: one accumulator pair per leftover row, summed over all
       n columns.  In the contiguous special cases two columns are taken per
       iteration; the shared remainder loop then uses lda / inc_x, which
       equal the literal strides of those special cases. */
    a_ptr = a;
    x_ptr = x;
    i = 0;

    switch (m3) {
    case 1:
        if (lda == 2 && inc_x == 2) {
            for (; i < (n & -2); i += 2) {
                CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr,     x_ptr);
                CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr + 2, x_ptr + 2);
                a_ptr += 4;
                x_ptr += 4;
            }
        }
        for (; i < n; i++) {
            CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr, x_ptr);
            a_ptr += lda;
            x_ptr += inc_x;
        }
        break;

    case 2:
        if (lda == 4 && inc_x == 2) {
            for (; i < (n & -2); i += 2) {
                CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr,     x_ptr);
                CGEMV_N_TAIL_MAC(sum_r[1], sum_i[1], a_ptr + 2, x_ptr);
                CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr + 4, x_ptr + 2);
                CGEMV_N_TAIL_MAC(sum_r[1], sum_i[1], a_ptr + 6, x_ptr + 2);
                a_ptr += 8;
                x_ptr += 4;
            }
        }
        for (; i < n; i++) {
            CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr,     x_ptr);
            CGEMV_N_TAIL_MAC(sum_r[1], sum_i[1], a_ptr + 2, x_ptr);
            a_ptr += lda;
            x_ptr += inc_x;
        }
        break;

    case 3:
        for (; i < n; i++) {
            CGEMV_N_TAIL_MAC(sum_r[0], sum_i[0], a_ptr,     x_ptr);
            CGEMV_N_TAIL_MAC(sum_r[1], sum_i[1], a_ptr + 2, x_ptr);
            CGEMV_N_TAIL_MAC(sum_r[2], sum_i[2], a_ptr + 4, x_ptr);
            a_ptr += lda;
            x_ptr += inc_x;
        }
        break;

    default:
        break;
    }

    /* y_ptr points just past the rows written by the blocked loop. */
    for (k = 0; k < m3; k++) {
        CGEMV_N_TAIL_STORE(y_ptr + k * inc_y, sum_r[k], sum_i[k]);
    }
    return (0);
}

#undef CGEMV_N_TAIL_STORE
#undef CGEMV_N_TAIL_MAC
#undef CGEMV_N_TAIL_IM
#undef CGEMV_N_TAIL_RE
@@ -0,0 +1,571 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#define NBMAX 1024 | |||
#include <altivec.h> | |||
/*
 * vec_perm selector that exchanges the two 32-bit words inside each 64-bit
 * half of a vector, i.e. swaps the real and imaginary parts of both packed
 * complex floats.
 *
 * The kernels in this file load it with
 *     *((__vector unsigned char *) swap_mask_arr)
 * and a dereference through a vector pointer assumes 16-byte alignment,
 * which a plain unsigned char array does not guarantee.  Force the alignment
 * so the load is well defined.
 */
static const unsigned char swap_mask_arr[] __attribute__((aligned(16))) = { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
/*
 * Horizontal reductions for cgemv_kernel_4x4.
 *   p: lane-wise products x*a       -> (xr*ar, xi*ai, xr*ar, xi*ai)
 *   s: lane-wise products swap(x)*a -> (xi*ar, xr*ai, xi*ar, xr*ai)
 * The CONJ/XCONJ combination selects which of the two sums alternates sign.
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_K4X4_REAL(p) ((p)[0] - (p)[1] + (p)[2] - (p)[3])
#define CGEMV_K4X4_IMAG(s) ((s)[0] + (s)[1] + (s)[2] + (s)[3])
#else
#define CGEMV_K4X4_REAL(p) ((p)[0] + (p)[1] + (p)[2] + (p)[3])
#define CGEMV_K4X4_IMAG(s) ((s)[0] - (s)[1] + (s)[2] - (s)[3])
#endif

/*
 * Dot products of four consecutive columns of a with x, scaled by alpha and
 * added to y[0..7] (four packed complex results).
 *
 * n      number of complex elements per column; four are consumed per loop
 *        iteration, so n is expected to be a multiple of 4
 * lda    distance between columns, in FLOATs
 * ap     first column
 * x      contiguous input vector
 * y      four packed complex outputs, updated in place
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    FLOAT *c1 = ap + lda;
    FLOAT *c2 = c1 + lda;
    FLOAT *c3 = c2 + lda;
    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) c1;
    __vector float *col2 = (__vector float *) c2;
    __vector float *col3 = (__vector float *) c3;
    __vector float *xv   = (__vector float *) x;

    /* dirN: sums of x*a, swpN: sums of swap(x)*a, for column N. */
    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp1 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir2 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp2 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir3 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp3 = {0.0, 0.0, 0.0, 0.0};

    const BLASLONG nvec = n / 2;
    BLASLONG k, c;

    for (k = 0; k < nvec; k += 2) {
        const __vector float xa    = xv[k];
        const __vector float xb    = xv[k + 1];
        const __vector float xa_sw = vec_perm(xa, xa, swap_mask);
        const __vector float xb_sw = vec_perm(xb, xb, swap_mask);

        dir0 += xa*col0[k]    + xb*col0[k + 1];
        swp0 += xa_sw*col0[k] + xb_sw*col0[k + 1];
        dir1 += xa*col1[k]    + xb*col1[k + 1];
        swp1 += xa_sw*col1[k] + xb_sw*col1[k + 1];
        dir2 += xa*col2[k]    + xb*col2[k + 1];
        swp2 += xa_sw*col2[k] + xb_sw*col2[k + 1];
        dir3 += xa*col3[k]    + xb*col3[k + 1];
        swp3 += xa_sw*col3[k] + xb_sw*col3[k + 1];
    }

    {
        const FLOAT dot_r[4] = {
            CGEMV_K4X4_REAL(dir0), CGEMV_K4X4_REAL(dir1),
            CGEMV_K4X4_REAL(dir2), CGEMV_K4X4_REAL(dir3)
        };
        const FLOAT dot_i[4] = {
            CGEMV_K4X4_IMAG(swp0), CGEMV_K4X4_IMAG(swp1),
            CGEMV_K4X4_IMAG(swp2), CGEMV_K4X4_IMAG(swp3)
        };

        /* y[c] += alpha * dot[c] (conjugated form when XCONJ is set). */
        for (c = 0; c < 4; c++) {
#if !defined(XCONJ)
            y[2 * c]     += alpha_r * dot_r[c] - alpha_i * dot_i[c];
            y[2 * c + 1] += alpha_r * dot_i[c] + alpha_i * dot_r[c];
#else
            y[2 * c]     += alpha_r * dot_r[c] + alpha_i * dot_i[c];
            y[2 * c + 1] -= alpha_r * dot_i[c] - alpha_i * dot_r[c];
#endif
        }
    }
}

#undef CGEMV_K4X4_IMAG
#undef CGEMV_K4X4_REAL
/*
 * Horizontal reductions for cgemv_kernel_4x2.
 *   p: lane-wise products x*a       -> (xr*ar, xi*ai, xr*ar, xi*ai)
 *   s: lane-wise products swap(x)*a -> (xi*ar, xr*ai, xi*ar, xr*ai)
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_K4X2_REAL(p) ((p)[0] - (p)[1] + (p)[2] - (p)[3])
#define CGEMV_K4X2_IMAG(s) ((s)[0] + (s)[1] + (s)[2] + (s)[3])
#else
#define CGEMV_K4X2_REAL(p) ((p)[0] + (p)[1] + (p)[2] + (p)[3])
#define CGEMV_K4X2_IMAG(s) ((s)[0] - (s)[1] + (s)[2] - (s)[3])
#endif

/*
 * Two-column variant of the kernel: dot products of the columns at ap and
 * ap + lda with x, scaled by alpha and added to y[0..3].
 *
 * n is the number of complex elements per column; four are consumed per
 * loop iteration, so n is expected to be a multiple of 4.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *xv   = (__vector float *) x;

    /* dirN: sums of x*a, swpN: sums of swap(x)*a, for column N. */
    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float swp1 = {0.0, 0.0, 0.0, 0.0};

    const BLASLONG nvec = n / 2;
    BLASLONG k, c;

    for (k = 0; k < nvec; k += 2) {
        const __vector float xa    = xv[k];
        const __vector float xb    = xv[k + 1];
        const __vector float xa_sw = vec_perm(xa, xa, swap_mask);
        const __vector float xb_sw = vec_perm(xb, xb, swap_mask);

        dir0 += xa*col0[k]    + xb*col0[k + 1];
        swp0 += xa_sw*col0[k] + xb_sw*col0[k + 1];
        dir1 += xa*col1[k]    + xb*col1[k + 1];
        swp1 += xa_sw*col1[k] + xb_sw*col1[k + 1];
    }

    {
        const FLOAT dot_r[2] = { CGEMV_K4X2_REAL(dir0), CGEMV_K4X2_REAL(dir1) };
        const FLOAT dot_i[2] = { CGEMV_K4X2_IMAG(swp0), CGEMV_K4X2_IMAG(swp1) };

        /* y[c] += alpha * dot[c] (conjugated form when XCONJ is set). */
        for (c = 0; c < 2; c++) {
#if !defined(XCONJ)
            y[2 * c]     += alpha_r * dot_r[c] - alpha_i * dot_i[c];
            y[2 * c + 1] += alpha_r * dot_i[c] + alpha_i * dot_r[c];
#else
            y[2 * c]     += alpha_r * dot_r[c] + alpha_i * dot_i[c];
            y[2 * c + 1] -= alpha_r * dot_i[c] - alpha_i * dot_r[c];
#endif
        }
    }
}

#undef CGEMV_K4X2_IMAG
#undef CGEMV_K4X2_REAL
/*
 * Single-column variant of the kernel: dot product of the column at ap with
 * x, scaled by alpha and added to the complex value at y[0..1].
 *
 * n is the number of complex elements in the column; four are consumed per
 * loop iteration, so n is expected to be a multiple of 4.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    const __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col = (__vector float *) ap;
    __vector float *xv  = (__vector float *) x;

    /* dir: lane-wise sums of x*a       -> (xr*ar, xi*ai, xr*ar, xi*ai)
       swp: lane-wise sums of swap(x)*a -> (xi*ar, xr*ai, xi*ar, xr*ai) */
    __vector float dir = {0.0, 0.0, 0.0, 0.0};
    __vector float swp = {0.0, 0.0, 0.0, 0.0};

    const BLASLONG nvec = n / 2;
    BLASLONG k;
    FLOAT dot_r, dot_i;

    for (k = 0; k < nvec; k += 2) {
        const __vector float xa    = xv[k];
        const __vector float xb    = xv[k + 1];
        const __vector float xa_sw = vec_perm(xa, xa, swap_mask);
        const __vector float xb_sw = vec_perm(xb, xb, swap_mask);

        dir += xa*col[k]    + xb*col[k + 1];
        swp += xa_sw*col[k] + xb_sw*col[k + 1];
    }

    /* Fold the four lanes; the CONJ/XCONJ combination picks the signs. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    dot_r = dir[0] - dir[1] + dir[2] - dir[3];
    dot_i = swp[0] + swp[1] + swp[2] + swp[3];
#else
    dot_r = dir[0] + dir[1] + dir[2] + dir[3];
    dot_i = swp[0] - swp[1] + swp[2] - swp[3];
#endif

    /* y += alpha * dot (conjugated form when XCONJ is set). */
#if !defined(XCONJ)
    y[0] += alpha_r * dot_r - alpha_i * dot_i;
    y[1] += alpha_r * dot_i + alpha_i * dot_r;
#else
    y[0] += alpha_r * dot_r + alpha_i * dot_i;
    y[1] -= alpha_r * dot_i - alpha_i * dot_r;
#endif
}
/*
 * Gather n complex values from a strided source into a packed destination.
 *
 * src      source; consecutive complex values are inc_src FLOATs apart
 * dest     destination, written as contiguous (re, im) pairs
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;

    for (k = 0; k < n; k++, src += inc_src) {
        dest[2 * k]     = src[0];
        dest[2 * k + 1] = src[1];
    }
}
/*
 * Scalar helpers for the 1-3 leftover rows handled at the end of CNAME.
 *
 * CGEMV_T_TAIL_RE/IM  real / imaginary part of the product of the complex
 *                     value at ap with (xr, xi); the CONJ/XCONJ combination
 *                     selects the sign pattern.
 * CGEMV_T_TAIL_STORE  adds alpha times (tr, ti) to the complex value at yp;
 *                     uses the alpha_r / alpha_i parameters of the enclosing
 *                     function.
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define CGEMV_T_TAIL_RE(ap, xr, xi) ((ap)[0] * (xr) - (ap)[1] * (xi))
#define CGEMV_T_TAIL_IM(ap, xr, xi) ((ap)[0] * (xi) + (ap)[1] * (xr))
#else
#define CGEMV_T_TAIL_RE(ap, xr, xi) ((ap)[0] * (xr) + (ap)[1] * (xi))
#define CGEMV_T_TAIL_IM(ap, xr, xi) ((ap)[0] * (xi) - (ap)[1] * (xr))
#endif
#if !defined(XCONJ)
#define CGEMV_T_TAIL_STORE(yp, tr, ti) \
    do { (yp)[0] += alpha_r * (tr) - alpha_i * (ti); \
         (yp)[1] += alpha_r * (ti) + alpha_i * (tr); } while (0)
#else
#define CGEMV_T_TAIL_STORE(yp, tr, ti) \
    do { (yp)[0] += alpha_r * (tr) + alpha_i * (ti); \
         (yp)[1] -= alpha_r * (ti) - alpha_i * (tr); } while (0)
#endif

/*
 * Driver for the transposed complex GEMV update: every one of the n entries
 * of y receives alpha times the dot product of the matching column of a
 * with x (conjugation variants selected by CONJ / XCONJ).
 *
 * m, n          rows / columns of the matrix a
 * dummy1        unused
 * alpha_r/i     complex scale factor
 * a, lda        matrix and its column stride in complex elements
 * x, inc_x      input vector (length m) and stride in complex elements
 * y, inc_y      output vector (length n) and stride in complex elements
 * buffer        scratch space that receives a packed copy of x when x is
 *               strided
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 4
 * rows) through the cgemv_kernel_4x* helpers; the remaining m % 4 rows are
 * handled by the scalar code at the end.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    FLOAT ybuffer[8];
    FLOAT *xbuffer = buffer;
    FLOAT *a_ptr, *y_ptr;
    FLOAT xr[3], xi[3];
    BLASLONG i, j, k;
    BLASLONG n1, n2, m1, m2, m3;
    BLASLONG NB = NBMAX;

    if (m < 1 || n < 1) return (0);

    /* From here on strides are counted in FLOATs, not complex elements. */
    inc_x *= 2;
    inc_y *= 2;
    lda   *= 2;

    n1 = n / 4;                     /* groups of four columns          */
    n2 = n % 4;                     /* leftover columns                */
    m3 = m % 4;                     /* leftover rows (scalar tail)     */
    m1 = m - m3;                    /* rows covered by the kernels     */
    m2 = (m & (NBMAX - 1)) - m3;    /* size of the final partial block */

    /* NB starts at NBMAX, so the first pass always runs; the pass that
       shrinks NB to m2 is the last one. */
    do {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        y_ptr = y;
        a_ptr = a;

        /* The kernels need a packed x: use it in place when it already is,
           otherwise gather this block of rows into the scratch buffer. */
        if (inc_x == 2)
            xbuffer = x;
        else
            copy_x(NB, x, xbuffer, inc_x);

        if (inc_y == 2) {
            /* Packed y: the kernels update it directly. */
            for (i = 0; i < n1; i++) {
                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += 4 * lda;
                y_ptr += 8;
            }
            if (n2 & 2) {
                cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += 2 * lda;
                y_ptr += 4;
            }
            if (n2 & 1) {
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr += 2;
            }
        } else {
            /* Strided y: let the kernels fill a zeroed packed buffer and
               scatter-add the results. */
            for (i = 0; i < n1; i++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
                a_ptr += 4 * lda;
                for (k = 0; k < 4; k++) {
                    y_ptr[0] += ybuffer[2 * k];
                    y_ptr[1] += ybuffer[2 * k + 1];
                    y_ptr += inc_y;
                }
            }
            for (i = 0; i < n2; i++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
            }
        }

        a += 2 * NB;
        x += NB * inc_x;
    } while (NB == NBMAX);

    if (m3 == 0) return (0);

    /* Scalar tail: a and x now point at the first leftover row.  Load the
       m3 remaining x values once, then walk all n columns. */
    for (k = 0; k < m3; k++) {
        xr[k] = x[k * inc_x];
        xi[k] = x[k * inc_x + 1];
    }
    a_ptr = a;
    y_ptr = y;
    j = 0;

    switch (m3) {
    case 3:
        for (; j < n; j++) {
            FLOAT dot_r = CGEMV_T_TAIL_RE(a_ptr, xr[0], xi[0]);
            FLOAT dot_i = CGEMV_T_TAIL_IM(a_ptr, xr[0], xi[0]);
            dot_r += CGEMV_T_TAIL_RE(a_ptr + 2, xr[1], xi[1]);
            dot_i += CGEMV_T_TAIL_IM(a_ptr + 2, xr[1], xi[1]);
            dot_r += CGEMV_T_TAIL_RE(a_ptr + 4, xr[2], xi[2]);
            dot_i += CGEMV_T_TAIL_IM(a_ptr + 4, xr[2], xi[2]);

            CGEMV_T_TAIL_STORE(y_ptr, dot_r, dot_i);
            a_ptr += lda;
            y_ptr += inc_y;
        }
        break;

    case 2:
        /* Two columns per iteration, then at most one leftover column. */
        for (; j < (n & -2); j += 2) {
            FLOAT *a_next = a_ptr + lda;
            FLOAT dot_r0, dot_i0, dot_r1, dot_i1;

            dot_r0  = CGEMV_T_TAIL_RE(a_ptr, xr[0], xi[0]);
            dot_i0  = CGEMV_T_TAIL_IM(a_ptr, xr[0], xi[0]);
            dot_r0 += CGEMV_T_TAIL_RE(a_ptr + 2, xr[1], xi[1]);
            dot_i0 += CGEMV_T_TAIL_IM(a_ptr + 2, xr[1], xi[1]);

            dot_r1  = CGEMV_T_TAIL_RE(a_next, xr[0], xi[0]);
            dot_i1  = CGEMV_T_TAIL_IM(a_next, xr[0], xi[0]);
            dot_r1 += CGEMV_T_TAIL_RE(a_next + 2, xr[1], xi[1]);
            dot_i1 += CGEMV_T_TAIL_IM(a_next + 2, xr[1], xi[1]);

            CGEMV_T_TAIL_STORE(y_ptr, dot_r0, dot_i0);
            y_ptr += inc_y;
            CGEMV_T_TAIL_STORE(y_ptr, dot_r1, dot_i1);
            y_ptr += inc_y;
            a_ptr = a_next + lda;
        }
        for (; j < n; j++) {
            FLOAT dot_r = CGEMV_T_TAIL_RE(a_ptr, xr[0], xi[0]);
            FLOAT dot_i = CGEMV_T_TAIL_IM(a_ptr, xr[0], xi[0]);
            dot_r += CGEMV_T_TAIL_RE(a_ptr + 2, xr[1], xi[1]);
            dot_i += CGEMV_T_TAIL_IM(a_ptr + 2, xr[1], xi[1]);

            CGEMV_T_TAIL_STORE(y_ptr, dot_r, dot_i);
            a_ptr += lda;
            y_ptr += inc_y;
        }
        break;

    case 1:
        /* Two columns per iteration, then at most one leftover column. */
        for (; j < (n & -2); j += 2) {
            FLOAT *a_next = a_ptr + lda;
            const FLOAT dot_r0 = CGEMV_T_TAIL_RE(a_ptr, xr[0], xi[0]);
            const FLOAT dot_i0 = CGEMV_T_TAIL_IM(a_ptr, xr[0], xi[0]);
            const FLOAT dot_r1 = CGEMV_T_TAIL_RE(a_next, xr[0], xi[0]);
            const FLOAT dot_i1 = CGEMV_T_TAIL_IM(a_next, xr[0], xi[0]);

            CGEMV_T_TAIL_STORE(y_ptr, dot_r0, dot_i0);
            y_ptr += inc_y;
            CGEMV_T_TAIL_STORE(y_ptr, dot_r1, dot_i1);
            y_ptr += inc_y;
            a_ptr = a_next + lda;
        }
        for (; j < n; j++) {
            const FLOAT dot_r = CGEMV_T_TAIL_RE(a_ptr, xr[0], xi[0]);
            const FLOAT dot_i = CGEMV_T_TAIL_IM(a_ptr, xr[0], xi[0]);

            CGEMV_T_TAIL_STORE(y_ptr, dot_r, dot_i);
            a_ptr += lda;
            y_ptr += inc_y;
        }
        break;

    default:
        break;
    }

    return (0);
}

#undef CGEMV_T_TAIL_STORE
#undef CGEMV_T_TAIL_IM
#undef CGEMV_T_TAIL_RE
@@ -0,0 +1,231 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2018, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#if defined(POWER8) || defined(POWER9) | |||
/*
 * crot_kernel_8 - VSX inline-assembly core of the rotation
 *
 *     x <- c*x + s*y
 *     y <- c*y - s*x
 *
 * applied in place to 2*n floats of x and 2*n floats of y (see the "+m"
 * memory operands below).
 *
 * n     element count. Every pass handles 64 bytes (16 floats) of each
 *       vector and subtracts 8 from the counter. One pass always runs and
 *       only whole 64-byte blocks are read and written, so n is expected to
 *       be a positive multiple of 8; the caller in this file passes n & -8
 *       and only when that value is > 0.
 * x, y  contiguous arrays; the asm advances its own copies of the pointers.
 * c, s  rotation coefficients, replicated into every vector lane.
 *
 * t0..t7 only exist to let the compiler pick eight scratch VSX registers;
 * their values are never read after the asm statement.
 */
static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | | |
{ | | |
__vector float t0; | | |
__vector float t1; | | |
__vector float t2; | | |
__vector float t3; | | |
__vector float t4; | | |
__vector float t5; | | |
__vector float t6; | | |
__vector float t7; | | |
__asm__ | | |
( | | |
// vs36 <- c in every word, vs37 <- s in every word.
"xscvdpspn 36, %x[cos] \n\t" // load c to all words | | |
"xxspltw 36, 36, 0 \n\t" | | |
"xscvdpspn 37, %x[sin] \n\t" // load s to all words | | |
"xxspltw 37, 37, 0 \n\t" | | |
// Prologue: preload the first 64-byte block of x (vs32-35) and y (vs48-51).
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | | |
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | | |
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | | |
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | | |
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | | |
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | | |
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | | |
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | | |
// Step past the preloaded block and count it off; when nothing else is
// left, skip the loop and finish that block at label 2.
"addi %[x_ptr], %[x_ptr], 64 \n\t" | | |
"addi %[y_ptr], %[y_ptr], 64 \n\t" | | |
"addic. %[temp_n], %[temp_n], -8 \n\t" | | |
"ble 2f \n\t" | | |
".p2align 5 \n\t" | | |
// Main loop: compute on the block already held in vs32-35 / vs48-51 and
// reload those registers with the next block as soon as each one has been
// consumed.
"1: \n\t" | | |
"xvmulsp 40, 32, 36 \n\t" // c * x | | |
"xvmulsp 41, 33, 36 \n\t" | | |
"xvmulsp 42, 34, 36 \n\t" | | |
"xvmulsp 43, 35, 36 \n\t" | | |
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | | |
"xvmulsp %x[x2], 49, 36 \n\t" | | |
"xvmulsp %x[x1], 50, 36 \n\t" | | |
"xvmulsp %x[x3], 51, 36 \n\t" | | |
"xvmulsp 44, 32, 37 \n\t" // s * x | | |
"xvmulsp 45, 33, 37 \n\t" | | |
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x | | |
"lxvd2x 33, %[i16], %[x_ptr] \n\t" | | |
"xvmulsp 46, 34, 37 \n\t" | | |
"xvmulsp 47, 35, 37 \n\t" | | |
"lxvd2x 34, %[i32], %[x_ptr] \n\t" | | |
"lxvd2x 35, %[i48], %[x_ptr] \n\t" | | |
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | | |
"xvmulsp %x[x5], 49, 37 \n\t" | | |
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y | | |
"lxvd2x 49, %[i16], %[y_ptr] \n\t" | | |
"xvmulsp %x[x6], 50, 37 \n\t" | | |
"xvmulsp %x[x7], 51, 37 \n\t" | | |
"lxvd2x 50, %[i32], %[y_ptr] \n\t" | | |
"lxvd2x 51, %[i48], %[y_ptr] \n\t" | | |
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | | |
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | | |
// Move the pointers back onto the block that was just computed so the
// stores below overwrite it in place.
"addi %[x_ptr], %[x_ptr], -64 \n\t" | | |
"addi %[y_ptr], %[y_ptr], -64 \n\t" | | |
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | | |
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | | |
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | | |
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | | |
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | | |
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | | |
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | | |
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | | |
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | | |
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | | |
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | | |
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | | |
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | | |
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | | |
// Skip both the block just stored and the block already preloaded for the
// next pass (2 * 64 bytes).
"addi %[x_ptr], %[x_ptr], 128 \n\t" | | |
"addi %[y_ptr], %[y_ptr], 128 \n\t" | | |
"addic. %[temp_n], %[temp_n], -8 \n\t" | | |
"bgt 1b \n\t" | | |
// Epilogue: finish the last preloaded block; no further loads are issued.
"2: \n\t" | | |
"xvmulsp 40, 32, 36 \n\t" // c * x | | |
"xvmulsp 41, 33, 36 \n\t" | | |
"xvmulsp 42, 34, 36 \n\t" | | |
"xvmulsp 43, 35, 36 \n\t" | | |
"xvmulsp %x[x0], 48, 36 \n\t" // c * y | | |
"xvmulsp %x[x2], 49, 36 \n\t" | | |
"xvmulsp %x[x1], 50, 36 \n\t" | | |
"xvmulsp %x[x3], 51, 36 \n\t" | | |
"xvmulsp 44, 32, 37 \n\t" // s * x | | |
"xvmulsp 45, 33, 37 \n\t" | | |
"xvmulsp 46, 34, 37 \n\t" | | |
"xvmulsp 47, 35, 37 \n\t" | | |
"xvmulsp %x[x4], 48, 37 \n\t" // s * y | | |
"xvmulsp %x[x5], 49, 37 \n\t" | | |
"xvmulsp %x[x6], 50, 37 \n\t" | | |
"xvmulsp %x[x7], 51, 37 \n\t" | | |
"addi %[x_ptr], %[x_ptr], -64 \n\t" | | |
"addi %[y_ptr], %[y_ptr], -64 \n\t" | | |
"xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | | |
"xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | | |
"xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | | |
"xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | | |
"xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | | |
"xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | | |
"xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | | |
"xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | | |
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x | | |
"stxvd2x 41, %[i16], %[x_ptr] \n\t" | | |
"stxvd2x 42, %[i32], %[x_ptr] \n\t" | | |
"stxvd2x 43, %[i48], %[x_ptr] \n\t" | | |
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | | |
"stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | | |
"stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | | |
"stxvd2x %x[x3], %[i48], %[y_ptr] " | | |
// Outputs. The two "+m" operands tell the compiler that all 2*n floats of
// x and y are both read and written. x1 is backed by t2 and x2 by t1; the
// asm uses x0, x2, x1, x3 for the four y result vectors, so the swap only
// changes which C temporary stands behind which register.
: | | |
[mem_x] "+m" (*(float (*)[2*n])x), | | |
[mem_y] "+m" (*(float (*)[2*n])y), | | |
[temp_n] "+r" (n), | | |
[x_ptr] "+&b" (x), | | |
[y_ptr] "+&b" (y), | | |
[x0] "=wa" (t0), | | |
[x1] "=wa" (t2), | | |
[x2] "=wa" (t1), | | |
[x3] "=wa" (t3), | | |
[x4] "=wa" (t4), | | |
[x5] "=wa" (t5), | | |
[x6] "=wa" (t6), | | |
[x7] "=wa" (t7) | | |
// Inputs: the two coefficients and the byte offsets 16/32/48 used as index
// registers by the indexed loads and stores.
: | | |
[cos] "f" (c), | | |
[sin] "f" (s), | | |
[i16] "b" (16), | | |
[i32] "b" (32), | | |
[i48] "b" (48) | | |
// Clobbers: cr0 is written by addic., and the listed VSX registers are the
// ones named by number in the asm text above.
: | | |
"cr0", | | |
"vs32","vs33","vs34","vs35","vs36","vs37", | | |
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | | |
"vs48","vs49","vs50","vs51" | | |
); | | |
} | | |
#endif | |||
/*
 * Apply a plane rotation with real coefficients c and s to the vectors x and
 * y, in place.  Each of the n elements is a pair of consecutive FLOATs and
 * both members of the pair are rotated the same way:
 *
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 *
 * n      number of elements; nothing is done when n <= 0.
 * x, y   vectors to rotate.
 * inc_x  stride of x, counted in elements (pairs of FLOATs).
 * inc_y  stride of y, counted in elements (pairs of FLOATs).
 * c, s   rotation coefficients.
 *
 * Always returns 0.
 *
 * When both strides are 1 and the vector kernel is available, the largest
 * multiple of 8 elements is handed to crot_kernel_8 and only the remainder
 * is processed by the scalar loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT temp[2];
    BLASLONG inc_x2;
    BLASLONG inc_y2;

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        /* crot_kernel_8 only exists under this same condition; without the
           guard the file does not build for any other target.  When the
           kernel is absent the scalar loop below handles all n elements. */
#if defined(POWER8) || defined(POWER9)
        BLASLONG n1 = n & -8;    /* largest multiple of 8 not exceeding n */

        if ( n1 > 0 )
        {
            crot_kernel_8(n1, x, y, c, s);
            i = n1;
            ix = 2 * n1;         /* two FLOATs per element */
        }
#endif

        /* Scalar remainder; x and y share the index because both are
           contiguous on this path. */
        while (i < n)
        {
            temp[0] = c*x[ix]   + s*y[ix] ;
            temp[1] = c*x[ix+1] + s*y[ix+1] ;
            y[ix]   = c*y[ix]   - s*x[ix] ;
            y[ix+1] = c*y[ix+1] - s*x[ix+1] ;
            x[ix]   = temp[0] ;
            x[ix+1] = temp[1] ;

            ix += 2 ;
            i++ ;
        }
    }
    else
    {
        /* Strided path: convert element strides to FLOAT strides. */
        inc_x2 = 2 * inc_x ;
        inc_y2 = 2 * inc_y ;

        while (i < n)
        {
            temp[0] = c*x[ix]   + s*y[iy] ;
            temp[1] = c*x[ix+1] + s*y[iy+1] ;
            y[iy]   = c*y[iy]   - s*x[ix] ;
            y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
            x[ix]   = temp[0] ;
            x[ix+1] = temp[1] ;

            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }

    return(0);
}
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "cswap_microk_power8.c" | |||
#endif | |||
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dasum_microk_power8.c" | |||
#endif | |||
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "daxpy_microk_power8.c" | |||
#endif | |||
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dcopy_microk_power8.c" | |||
#endif | |||
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "ddot_microk_power8.c" | |||
#endif | |||
@@ -0,0 +1,249 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2019, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "def_vsx.h" | |||
/* Load mnemonic alias; not referenced in the visible part of this file. */
#define LOAD ld | | |
/* Size in bytes of the local frame: subtracted from SP on entry and added
   back before returning. */
#define STACKSIZE (512 ) | | |
/* Frame slot (offset 488) where the incoming alpha (f1) is spilled. */
#define ALPHA_SP (296+192)(SP) | | |
/* Frame slot (offset 496) that receives a zero word on entry. */
#define FZERO (304+192)(SP) | | |
/* Incoming arguments.  M, N and K are each tested for <= 0 on entry; LDC is
   shifted left by BASE_SHIFT on entry; OFFSET is only loaded (from the
   stack) when TRMMKERNEL is defined.
   NOTE(review): presumably M/N/K are the GEMM dimensions and A/B/C the
   matrix pointers - confirm against the kernel calling convention. */
#define M r3 | | |
#define N r4 | | |
#define K r5 | | |
#define A r7 | | |
#define B r8 | | |
#define C r9 | | |
#define LDC r10 | | |
#define OFFSET r6 | | |
/* VSX register loaded with alpha (lxvdsx from the ALPHA_SP slot). */
#define alpha_r vs18 | | |
/* Literal zero, for use where a zero index operand is wanted. */
#define o0 0 | | |
/* Scratch and working registers.  r14-r31 are saved on entry and restored
   on exit.  o8/o16/o24/o32/o48 are loaded with the constants 8/16/24/32/48
   and PRE with 384 before the included logic runs; T1 is used on entry to
   hold the address of the alpha slot.
   NOTE(review): the remaining names are consumed by the included
   macro/logic files, which are not visible here; PRE is presumably a
   prefetch distance - confirm. */
#define T4 r12 | | |
#define T3 r11 | | |
#define C4 r14 | | |
#define o8 r15 | | |
#define o24 r16 | | |
#define C2 r17 | | |
#define L r18 | | |
#define T1 r19 | | |
#define C3 r20 | | |
#define TEMP_REG r21 | | |
#define I r22 | | |
#define J r23 | | |
#define AO r24 | | |
#define BO r25 | | |
#define CO r26 | | |
#define o16 r27 | | |
#define o32 r28 | | |
#define o48 r29 | | |
#define PRE r30 | | |
#define T2 r31 | | |
#include "dgemm_macros_power9.S" | |||
#ifndef NEEDPARAM

/*
 * Kernel entry point.
 *
 * Allocates a STACKSIZE-byte frame, saves the callee-saved registers the
 * kernel uses (f14-f31, r14-r31, v20-v31), spills alpha and a zero word to
 * the frame, sets up the constant index registers and then runs the code
 * from dgemm_logic_power9.S.  The exit path at .L999 returns 0 in r3 after
 * restoring everything that was saved.
 *
 * Frame layout (offsets from SP after allocation):
 *     0..143    f14-f31
 *   144..287    r31 down to r14
 *   288..479    v20-v31 (vs52-vs63), 16 bytes each
 *   488         alpha     (ALPHA_SP)
 *   496         zero word (FZERO)
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0			/* r0 = 0, stored to FZERO below */

	/* Save f14-f31. */
	stfd	f14, 0(SP)
	stfd	f15, 8(SP)
	stfd	f16, 16(SP)
	stfd	f17, 24(SP)

	stfd	f18, 32(SP)
	stfd	f19, 40(SP)
	stfd	f20, 48(SP)
	stfd	f21, 56(SP)

	stfd	f22, 64(SP)
	stfd	f23, 72(SP)
	stfd	f24, 80(SP)
	stfd	f25, 88(SP)

	stfd	f26, 96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)

	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* Save r14-r31. */
	std	r31, 144(SP)
	std	r30, 152(SP)
	std	r29, 160(SP)
	std	r28, 168(SP)
	std	r27, 176(SP)
	std	r26, 184(SP)
	std	r25, 192(SP)
	std	r24, 200(SP)
	std	r23, 208(SP)
	std	r22, 216(SP)
	std	r21, 224(SP)
	std	r20, 232(SP)
	std	r19, 240(SP)
	std	r18, 248(SP)
	std	r17, 256(SP)
	std	r16, 264(SP)
	std	r15, 272(SP)
	std	r14, 280(SP)

	/* Save the callee-saved vector registers v20-v31.  stxv/lxv address the
	   unified VSX register file, in which v20-v31 are vs52-vs63.  Spelled
	   as v20..v31 the operand is just the number 20..31, which selects
	   vs20-vs31 and leaves v20-v31 unprotected, so the VSX names are
	   used here and in the matching restore. */
	stxv	vs52, 288(SP)
	stxv	vs53, 304(SP)
	stxv	vs54, 320(SP)
	stxv	vs55, 336(SP)
	stxv	vs56, 352(SP)
	stxv	vs57, 368(SP)
	stxv	vs58, 384(SP)
	stxv	vs59, 400(SP)
	stxv	vs60, 416(SP)
	stxv	vs61, 432(SP)
	stxv	vs62, 448(SP)
	stxv	vs63, 464(SP)

	stfd	f1, ALPHA_SP		/* spill alpha so it can be splatted below */
	stw	r0, FZERO

	/* NOTE(review): presumably converts LDC from elements to bytes -
	   BASE_SHIFT comes from common.h, confirm. */
	slwi	LDC, LDC, BASE_SHIFT

#if defined(TRMMKERNEL)
	ld	OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

	/* Leave early when any of M, N, K is <= 0.  .L999_H1 is not defined in
	   this file; it is expected to come from the included logic file. */
	cmpwi	cr0, M, 0
	ble	.L999_H1
	cmpwi	cr0, N, 0
	ble	.L999_H1
	cmpwi	cr0, K, 0
	ble	.L999_H1

	addi	T1, SP, 296+192		/* address of the ALPHA_SP slot */

	/* Constants used as index registers by the included macros. */
	li	PRE, 384
	li	o8 , 8
	li	o16, 16
	li	o24, 24
	li	o32, 32
	li	o48, 48

	lxvdsx	alpha_r, 0, T1		/* alpha_r = alpha in both doublewords */

#include "dgemm_logic_power9.S"

.L999:
	addi	r3, 0, 0		/* return value 0 */

	/* Restore f14-f31. */
	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)

	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)

	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)

	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)

	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	/* Restore r14-r31. */
	ld	r31, 144(SP)
	ld	r30, 152(SP)
	ld	r29, 160(SP)
	ld	r28, 168(SP)
	ld	r27, 176(SP)
	ld	r26, 184(SP)
	ld	r25, 192(SP)
	ld	r24, 200(SP)
	ld	r23, 208(SP)
	ld	r22, 216(SP)
	ld	r21, 224(SP)
	ld	r20, 232(SP)
	ld	r19, 240(SP)
	ld	r18, 248(SP)
	ld	r17, 256(SP)
	ld	r16, 264(SP)
	ld	r15, 272(SP)
	ld	r14, 280(SP)

	/* Restore v20-v31 (vs52-vs63); must mirror the save sequence above. */
	lxv	vs52, 288(SP)
	lxv	vs53, 304(SP)
	lxv	vs54, 320(SP)
	lxv	vs55, 336(SP)
	lxv	vs56, 352(SP)
	lxv	vs57, 368(SP)
	lxv	vs58, 384(SP)
	lxv	vs59, 400(SP)
	lxv	vs60, 416(SP)
	lxv	vs61, 432(SP)
	lxv	vs62, 448(SP)
	lxv	vs63, 464(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "common.h" | |||
#if defined(POWER8) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dgemv_n_microk_power8.c" | |||
#endif | |||