@@ -125,9 +125,9 @@ task: | |||
- make USE_OPENMP=1 | |||
FreeBSD_task: | |||
name: FreeBSD-gcc12 | |||
name: FreeBSD-gcc | |||
freebsd_instance: | |||
image_family: freebsd-13-3 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
compile_script: | |||
@@ -136,9 +136,9 @@ FreeBSD_task: | |||
FreeBSD_task: | |||
name: freebsd-gcc12-ilp64 | |||
name: freebsd-gcc-ilp64 | |||
freebsd_instance: | |||
image_family: freebsd-13-3 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
compile_script: | |||
@@ -148,7 +148,7 @@ FreeBSD_task: | |||
FreeBSD_task: | |||
name: FreeBSD-clang-openmp | |||
freebsd_instance: | |||
image_family: freebsd-13-3 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so | |||
@@ -37,7 +37,7 @@ jobs: | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross | |||
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||
- name: checkout qemu | |||
uses: actions/checkout@v3 | |||
@@ -52,6 +52,7 @@ jobs: | |||
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
cd qemu | |||
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
make -j$(nproc) | |||
make install | |||
@@ -15,7 +15,7 @@ jobs: | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
os: [ubuntu-latest] | |||
os: [ubuntu-22.04] | |||
fortran: [gfortran] | |||
build: [make] | |||
pyver: ["3.12"] | |||
@@ -147,7 +147,7 @@ jobs: | |||
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' | |||
- name: Run benchmarks | |||
uses: CodSpeedHQ/action@v2 | |||
uses: CodSpeedHQ/action@v3 | |||
with: | |||
token: ${{ secrets.CODSPEED_TOKEN }} | |||
run: | | |||
@@ -23,7 +23,7 @@ jobs: | |||
python-version: "3.10" | |||
- name: Install MkDocs and doc theme packages | |||
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin | |||
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin | |||
- name: Build docs site | |||
run: mkdocs build | |||
@@ -43,7 +43,9 @@ jobs: | |||
run: | | |||
if [ "$RUNNER_OS" == "Linux" ]; then | |||
sudo apt-get update | |||
sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||
sudo apt-get install -y gfortran cmake ccache | |||
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||
elif [ "$RUNNER_OS" == "macOS" ]; then | |||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
brew reinstall gcc | |||
@@ -158,7 +160,7 @@ jobs: | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
msystem: [UCRT64, MINGW32, CLANG64, CLANG32] | |||
msystem: [UCRT64, MINGW32, CLANG64] | |||
idx: [int32, int64] | |||
build-type: [Release] | |||
include: | |||
@@ -174,14 +176,6 @@ jobs: | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: CLANG32 | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-i686 | |||
fc-pkg: cc | |||
c-lapack-flags: -DC_LAPACK=ON | |||
- msystem: UCRT64 | |||
idx: int64 | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
@@ -192,9 +186,6 @@ jobs: | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: UCRT64 | |||
idx: int32 | |||
target-prefix: mingw-w64-ucrt-x86_64 | |||
@@ -203,8 +194,6 @@ jobs: | |||
exclude: | |||
- msystem: MINGW32 | |||
idx: int64 | |||
- msystem: CLANG32 | |||
idx: int64 | |||
defaults: | |||
run: | |||
@@ -280,8 +269,6 @@ jobs: | |||
-DNUM_THREADS=64 \ | |||
-DTARGET=CORE2 \ | |||
${{ matrix.idx64-flags }} \ | |||
${{ matrix.c-lapack-flags }} \ | |||
${{ matrix.no-avx512-flags }} \ | |||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
.. | |||
@@ -369,3 +356,23 @@ jobs: | |||
- name: Build OpenBLAS | |||
run: | | |||
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} | |||
neoverse_build: | |||
if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
runs-on: ubuntu-24.04-arm | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
- name: Install Dependencies | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install -y gcc gfortran make | |||
- name: Build OpenBLAS | |||
run: | | |||
make -j${nproc} TARGET=NEOVERSEN2 | |||
make -j${nproc} TARGET=NEOVERSEN2 lapack-test | |||
@@ -0,0 +1,37 @@ | |||
name: harmonyos | |||
on: [push, pull_request] | |||
concurrency: | |||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | |||
cancel-in-progress: true | |||
permissions: | |||
contents: read # to fetch code (actions/checkout) | |||
jobs: | |||
build: | |||
if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
runs-on: ubuntu-latest | |||
env: | |||
OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake | |||
COMMON_CMAKE_OPTIONS: | | |||
-DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ | |||
-DCMAKE_INSTALL_PREFIX=install \ | |||
-DCMAKE_BUILD_TYPE=Release \ | |||
steps: | |||
- uses: actions/checkout@v4 | |||
- name: ndk-install | |||
run: | | |||
wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz | |||
tar -xf ohos-sdk-windows_linux-public.tar.gz | |||
cd ohos-sdk/linux | |||
unzip -q native-linux-x64-4.1.7.8-Release.zip | |||
cd - | |||
- name: build-armv8 | |||
run: | | |||
mkdir build && cd build | |||
${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \ | |||
-DTARGET=ARMV8 -DNOFORTRAN=1 .. | |||
${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc) | |||
@@ -41,7 +41,7 @@ jobs: | |||
- name: Install APT deps | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev | |||
- name: Download and install loongarch64-toolchain | |||
run: | | |||
@@ -41,14 +41,14 @@ jobs: | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross | |||
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev | |||
- name: checkout qemu | |||
uses: actions/checkout@v3 | |||
with: | |||
repository: qemu/qemu | |||
path: qemu | |||
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 | |||
ref: ae35f033b874c627d81d51070187fbf55f0bf1a7 | |||
- name: build qemu | |||
run: | | |||
@@ -229,3 +229,16 @@ In chronological order: | |||
* Christopher Daley <https://github.com/cdaley> | |||
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems | |||
* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | |||
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||
* Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 | |||
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||
* Marek Michalowski <https://github.com/michalowski-arm> | |||
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` | |||
* Ye Tao <ye.tao@arm.com> | |||
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 |
@@ -1,4 +1,99 @@ | |||
OpenBLAS ChangeLog | |||
==================================================================== | |||
Version 0.3.29 | |||
12-Jan-2025 | |||
general: | |||
- fixed a potential NULL pointer dereference in multithreaded builds | |||
- added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS | |||
- fixed a build failure when building without LAPACK_DEPRECATED functions | |||
- the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order | |||
to remove many compatibility and deprecation warnings | |||
- added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM) | |||
- fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data | |||
- improved thread scaling of multithreaded SBGEMV | |||
- improved thread scaling of multithreaded TRTRI | |||
- fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler) | |||
- added support for option handling changes in flang-new from LLVM18 onwards | |||
- added support for recent calling conventions changes in Cray and NVIDIA compilers | |||
- added support for compilation with the NAG Fortran compiler | |||
- fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file | |||
- improved the CMakeConfig file generated by the Makefile build | |||
- fixed const-correctness of cblas_?geadd in cblas.h | |||
- fixed a potential inaccuracy in multithreaded BLAS3 calls | |||
- fixed empty implementations of get/set_affinity that print a warning in OpenMP builds | |||
- fixed function signatures for TRTRS in the converted C version of LAPACK | |||
- fixed omission of several single-precision LAPACK symbols in the shared library | |||
- improved build instructions for the provided "pybench" benchmarks | |||
- improved documentation, including added build instructions for WoA and HarmonyOS | |||
as well as descriptions of environment variables that affect build and runtime behavior | |||
- added a separate "make install_tests" target for use with cross-compilations | |||
- integrated improvements and corrections from Reference-LAPACK: | |||
- removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062) | |||
- fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064) | |||
- replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080) | |||
arm: | |||
- fixed build with recent versions of the NDK (missing .type declaration of symbols) | |||
arm64: | |||
- fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to | |||
reads and writes outside the array bounds in some circumstances | |||
- rewrote cpu autodetection to scan all cores and return the highest performing type | |||
- improved the DGEMM performance for SVE targets and small matrix sizes | |||
- improved dimension criteria for forwarding from GEMM to GEMV kernels | |||
- added SVE kernels for ROT and SWAP | |||
- improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1 | |||
- added support for using the "small matrix" kernels with CMake as well | |||
- fixed compilation on Windows on Arm | |||
- improved compile-time detection of SVE capability | |||
- added cpu autodetection and initial support for Apple M4 | |||
- added support for compilation on systems running IOS | |||
- added support for compilation on NetBSD ("evbarm" architecture) | |||
- fixed NRM2 implementations for generic SVE targets and the Neoverse N2 | |||
- fixed compilation for SVE-capable targets with the NVIDIA compiler | |||
x86_64: | |||
- fixed a wrong storage size in the SBGEMV kernel for Cooper Lake | |||
- added cpu autodetection for Intel Granite Rapids | |||
- added cpu autodetection for AMD Ryzen 5 series | |||
- added optimized SOMATCOPY_CT for AVX-capable targets | |||
- fixed the fallback implementation of GEMM3M in GENERIC builds | |||
- tentatively re-enabled builds with the EXPRECISION option | |||
- worked around a miscompilation of tests with mingw32-gfortran14 | |||
- added support for compilation with the Intel oneAPI 2025.0 compiler on Windows | |||
power: | |||
- fixed multithreaded SBGEMM | |||
- fixed a CMake build problem on POWER10 | |||
- improved the performance of SGEMV | |||
- added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them | |||
- fixed illegal instructions and potential memory overflow in SGEMM on PPCG4 | |||
- fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440,G4 and 970 | |||
- added improved CGEMM and ZGEMM kernels for POWER10 | |||
- added Makefile logic to remove all optimization flags in DEBUG builds | |||
mips64: | |||
- fixed compilation with gcc14 | |||
- fixed GEMM parameter selection for the MIPS64_GENERIC target | |||
- fixed a potential build failure when compiling with OpenMP | |||
loongarch64: | |||
- fixed compilation for Loongson3 with recent versions of gmake | |||
- fixed a potential loss of precision in Loongson3A GEMM | |||
- fixed a potential build failure when compiling with OpenMP | |||
- added optimized SOMATCOPY for LASX-capable targets | |||
- introduced a new cpu naming scheme while retaining compatibility | |||
- added support for cross-compiling Loongarch64 targets with CMake | |||
- added support for compilation with LLVM | |||
riscv64: | |||
- removed thread yielding overhead caused by sched_yield | |||
- replaced some non-standard intrinsics with their official names | |||
- fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lenghts 128 and 256 | |||
- improved the performance of SNRM2/DNRM2 for RVV1.0 targets | |||
- added optimized ?OMATCOPY_CN kernels for RVV1.0 targets | |||
==================================================================== | |||
Version 0.3.28 | |||
8-Aug-2024 | |||
@@ -426,6 +426,9 @@ dummy : | |||
install : | |||
$(MAKE) -f Makefile.install install | |||
install_tests : | |||
$(MAKE) -f Makefile.install install_tests | |||
clean :: | |||
@for d in $(SUBDIRS_ALL) ; \ | |||
do if test -d $$d; then \ | |||
@@ -106,7 +106,7 @@ ifeq ($(CORE), NEOVERSEV1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
CCOMMON_OPT += -march=armv8.4-a+sve | |||
CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
ifeq (1, $(ISCLANG)) | |||
CCOMMON_OPT += -mtune=cortex-x1 | |||
else | |||
@@ -116,7 +116,7 @@ ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
endif | |||
else | |||
CCOMMON_OPT += -march=armv8.4-a+sve | |||
CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
ifneq ($(CROSS), 1) | |||
CCOMMON_OPT += -mtune=native | |||
endif | |||
@@ -356,4 +356,31 @@ endif | |||
endif | |||
else | |||
# NVIDIA HPC options necessary to enable SVE in the compiler | |||
ifeq ($(CORE), THUNDERX2T99) | |||
CCOMMON_OPT += -tp=thunderx2t99 | |||
FCOMMON_OPT += -tp=thunderx2t99 | |||
endif | |||
ifeq ($(CORE), NEOVERSEN1) | |||
CCOMMON_OPT += -tp=neoverse-n1 | |||
FCOMMON_OPT += -tp=neoverse-n1 | |||
endif | |||
ifeq ($(CORE), NEOVERSEV1) | |||
CCOMMON_OPT += -tp=neoverse-v1 | |||
FCOMMON_OPT += -tp=neoverse-v1 | |||
endif | |||
ifeq ($(CORE), NEOVERSEV2) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
ifeq ($(CORE), ARMV8SVE) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
ifeq ($(CORE), ARMV9SVE) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
endif |
@@ -191,22 +191,29 @@ endif | |||
#Generating OpenBLASConfig.cmake | |||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
ifneq ($(NO_SHARED),1) | |||
#ifeq logical or | |||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
ifeq ($(OSNAME), Darwin) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
@echo "add_library(OpenBLAS::OpenBLAS SHARED IMPORTED)" | |||
@echo "target_include_directories(OpenBLAS::OpenBLAS INTERFACE \$${OpenBLAS_INCLUDE_DIRS})" | |||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_LOCATION \$${OpenBLAS_LIBRARIES})" | |||
@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_IMPLIB \$${_OpenBLAS_ROOT_DIR}/lib/libopenblas.lib)" | |||
endif | |||
else | |||
#only static | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
#Generating OpenBLASConfigVersion.cmake | |||
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
@@ -220,3 +227,96 @@ endif | |||
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
@echo Install OK! | |||
install_tests : lib.grd | |||
ifneq ($(ONLY_CBLAS), 1) | |||
@install -m 666 utest/openblas_utest $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 utest/openblas_utest_ext $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
ifndef NO_FBLAS | |||
ifeq ($(BUILD_BFLOAT16),1) | |||
@install -m 666 test/test_sbgemm $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
ifeq ($(BUILD_SINGLE),1) | |||
@install -m 666 test/sblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/sblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/sblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/sblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/sblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
ifeq ($(BUILD_DOUBLE),1) | |||
@install -m 666 test/dblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/dblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/dblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/dblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/dblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
ifeq ($(BUILD_COMPLEX),1) | |||
@install -m 666 test/cblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/cblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/cblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/cblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/cblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) | |||
@install -m 666 test/cblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/cblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
ifeq ($(BUILD_COMPLEX16),1) | |||
@install -m 666 test/zblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/zblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/zblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/zblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/zblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) | |||
@install -m 666 test/zblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 test/zblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
endif | |||
endif | |||
ifneq ($(ONLY_CBLAS), 1) | |||
ifeq ($(BUILD_SINGLE),1) | |||
@install -m 666 ctest/xscblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xscblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xscblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/sin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/sin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
ifeq ($(BUILD_DOUBLE),1) | |||
@install -m 666 ctest/xdcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xdcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xdcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/din2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/din3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
ifeq ($(BUILD_COMPLEX),1) | |||
@install -m 666 ctest/xccblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xccblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xccblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/cin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/cin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) | |||
@install -m 666 ctest/xccblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/cin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
ifeq ($(BUILD_COMPLEX16),1) | |||
@install -m 666 ctest/xzcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xzcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/xzcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/zin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/zin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) | |||
@install -m 666 ctest/xzcblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 ctest/zin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
endif | |||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
@@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
endif | |||
ifeq ($(CORE), x280) | |||
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math | |||
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d | |||
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
endif | |||
ifeq ($(CORE), RISCV64_ZVL256B) | |||
@@ -3,7 +3,7 @@ | |||
# | |||
# This library's version | |||
VERSION = 0.3.28.dev | |||
VERSION = 0.3.29.dev | |||
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | |||
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | |||
@@ -447,7 +447,7 @@ endif | |||
ifeq ($(OSNAME), Linux) | |||
EXTRALIB += -lm | |||
NO_EXPRECISION = 1 | |||
#NO_EXPRECISION = 1 | |||
endif | |||
ifeq ($(OSNAME), Android) | |||
@@ -573,7 +573,7 @@ NO_BINARY_MODE = 1 | |||
endif | |||
ifeq ($(CORE), generic) | |||
NO_EXPRECISION = 1 | |||
#NO_EXPRECISION = 1 | |||
endif | |||
ifndef NO_EXPRECISION | |||
@@ -596,7 +596,7 @@ endif | |||
ifeq ($(ARCH), x86_64) | |||
ifeq ($(CORE), generic) | |||
NO_EXPRECISION = 1 | |||
#NO_EXPRECISION = 1 | |||
endif | |||
ifndef NO_EXPRECISION | |||
@@ -832,8 +832,8 @@ BINARY_DEFINED = 1 | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
ifeq ($(C_COMPILER), GCC) | |||
# EXPRECISION = 1 | |||
# CCOMMON_OPT += -DEXPRECISION | |||
EXPRECISION = 1 | |||
CCOMMON_OPT += -DEXPRECISION | |||
endif | |||
endif | |||
endif | |||
@@ -1396,17 +1396,15 @@ endif | |||
endif | |||
ifeq ($(F_COMPILER), CRAY) | |||
CCOMMON_OPT += -DF_INTERFACE_CRAYFC | |||
CCOMMON_OPT += -DF_INTERFACE_INTEL | |||
FCOMMON_OPT += -hnopattern | |||
ifdef INTERFACE64 | |||
ifneq ($(INTERFACE64), 0) | |||
FCOMMON_OPT += -s integer64 | |||
endif | |||
endif | |||
ifeq ($(USE_OPENMP), 1) | |||
FCOMMON_OPT += -fopenmp | |||
else | |||
FCOMMON_OPT += -fno-openmp | |||
ifneq ($(USE_OPENMP), 1) | |||
FCOMMON_OPT += -O noomp | |||
endif | |||
endif | |||
@@ -15,11 +15,14 @@ OSUOSL IBMZ-CI [ library based on GotoBLAS2 1.13 BSD version. | |||
Please read the documentation in the OpenBLAS folder: <https://github.com/OpenMathLib/OpenBLAS/docs>. | |||
For more information about OpenBLAS, please see: | |||
- The documentation at [openmathlib.org/OpenBLAS/docs/](http://www.openmathlib.org/OpenBLAS/docs), | |||
- The home page at [openmathlib.org/OpenBLAS/](http://www.openmathlib.org/OpenBLAS). | |||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: | |||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six | |||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful. | |||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare [here](https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/) or YouTube [here](https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek) may be helpful. | |||
## Binary Packages | |||
@@ -27,24 +30,29 @@ We provide official binary packages for the following platform: | |||
* Windows x86/x86_64 | |||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases). | |||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases). | |||
OpenBLAS is also packaged for many package managers - see [the installation section of the docs](http://www.openmathlib.org/OpenBLAS/docs/install/) for details. | |||
## Installation from Source | |||
Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code | |||
using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be | |||
sure to use the develop branch - master is several years out of date due to a change of maintainership.) | |||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | |||
Most can also be given directly on the make or cmake command line. | |||
Obtain the source code from https://github.com/OpenMathLib/OpenBLAS/. Note that the default branch | |||
is `develop` (a `master` branch is still present, but far out of date). | |||
Build-time parameters can be chosen in `Makefile.rule`, see there for a short description of each option. | |||
Most options can also be given directly on the command line as parameters to your `make` or `cmake` invocation. | |||
### Dependencies | |||
Building OpenBLAS requires the following to be installed: | |||
* GNU Make or CMake | |||
* A C compiler, e.g. GCC or Clang | |||
* A C compiler, e.g. GCC or Clang | |||
* A Fortran compiler (optional, for LAPACK) | |||
In general, using a recent version of the compiler is strongly recommended. | |||
If a Fortran compiler is not available, it is possible to compile an older version of the included LAPACK | |||
that has been machine-translated to C. | |||
### Normal compile | |||
@@ -60,6 +68,9 @@ For building with `cmake`, the usual conventions apply, i.e. create a build dire | |||
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any | |||
build options you plan to set. | |||
For more details, see the [Building from source](http://www.openmathlib.org/OpenBLAS/docs/install/#building-from-source) | |||
section in the docs. | |||
### Cross compile | |||
Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler. | |||
@@ -76,10 +87,12 @@ Examples: | |||
make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc | |||
``` | |||
You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the | |||
You can find instructions for other cases both in the "Supported Systems" section below and in | |||
the [Building from source docs](http://www.openmathlib.org/OpenBLAS/docs/install). | |||
The `.yml` scripts included with the sources (which contain the | |||
build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints. | |||
When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. | |||
When compiling for a more modern CPU target of the same architecture, e.g. `TARGET=SKYLAKEX` on a `HASWELL` host, option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build. | |||
### Debug version | |||
@@ -325,11 +338,14 @@ Please see Changelog.txt. | |||
## Troubleshooting | |||
* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/docs/faq,md) in the docs folder first. | |||
* Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. | |||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | |||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | |||
Clang 3.0 will generate the wrong AVX binary code. | |||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. | |||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake/CooperLake AVX512 kernels | |||
* Please use LLVM version 18 and above (version 19 and above on Windows) if you plan to use | |||
its new flang compiler for Fortran | |||
* Please use GCC version 11 and above to compile OpenBLAS on the POWER architecture | |||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), | |||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | |||
the library with `BIGNUMA=1`. | |||
@@ -350,4 +366,4 @@ Please see Changelog.txt. | |||
## Donation | |||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). | |||
Please see [the donations section](http://www.openmathlib.org/OpenBLAS/docs/about/#donations) in the docs. |
@@ -141,7 +141,7 @@ jobs: | |||
- job: OSX_OpenMP | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
steps: | |||
- script: | | |||
brew update | |||
@@ -151,7 +151,7 @@ jobs: | |||
- job: OSX_GCC_Nothreads | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
steps: | |||
- script: | | |||
brew update | |||
@@ -164,7 +164,19 @@ jobs: | |||
- script: | | |||
brew update | |||
make CC=gcc-12 FC=gfortran-12 | |||
- job: OSX_LLVM_flangnew | |||
pool: | |||
vmImage: 'macOS-latest' | |||
variables: | |||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
steps: | |||
- script: | | |||
brew update | |||
brew install llvm flang | |||
make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1 | |||
- job: OSX_OpenMP_Clang | |||
pool: | |||
vmImage: 'macOS-latest' | |||
@@ -195,7 +207,7 @@ jobs: | |||
- job: OSX_dynarch_cmake | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
variables: | |||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
@@ -242,7 +254,7 @@ jobs: | |||
- job: OSX_NDK_ARMV7 | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
steps: | |||
- script: | | |||
brew update | |||
@@ -252,7 +264,7 @@ jobs: | |||
- job: OSX_IOS_ARMV8 | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
variables: | |||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 | |||
@@ -262,7 +274,7 @@ jobs: | |||
- job: OSX_IOS_ARMV7 | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
variables: | |||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 | |||
@@ -272,7 +284,7 @@ jobs: | |||
- job: OSX_xbuild_DYNAMIC_ARM64 | |||
pool: | |||
vmImage: 'macOS-12' | |||
vmImage: 'macOS-13' | |||
variables: | |||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 | |||
@@ -43,7 +43,17 @@ have all what it takes to build OpenBLAS from source, plus `python` and | |||
$ python -mpip install numpy meson ninja pytest pytest-benchmark | |||
``` | |||
The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`. | |||
The Meson build system looks for the installed OpenBLAS using pkgconfig, so the openblas.pc created during the OpenBLAS build needs | |||
to be somewhere on the search path of pkgconfig or in a folder pointed to by the environment variable PKG_CONFIG_PATH. | |||
If you want to build the benchmark suite using flang (or flang-new) instead of gfortran for the Fortran parts, you currently need | |||
to edit the meson.build file and change the line `'fortran_std=legacy'` to `'fortran_std=none'` to work around an incompatibility | |||
between Meson and flang. | |||
If you are building and running the benchmark under MS Windows, it may be necessary to copy the generated openblas_wrap module from | |||
your build folder to the `benchmarks` folder. | |||
The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`. | |||
An ASV compatible benchmark suite is planned but currently not implemented. | |||
@@ -6,6 +6,9 @@ hostarch=`uname -m | sed -e 's/i.86/x86/'` | |||
if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then | |||
hostarch=`uname -p` | |||
fi | |||
if [ "$hostarch" = "evbarm" ]; then | |||
hostarch=`uname -p` | |||
fi | |||
case "$hostarch" in | |||
amd64) hostarch=x86_64 ;; | |||
arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;; | |||
@@ -45,13 +45,15 @@ if (NOT ONLY_CBLAS) | |||
# TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile | |||
# TODO: set FEXTRALIB flags a la f_check? | |||
if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND x${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) | |||
set(BU "_") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define BUNDERSCORE _\n" | |||
"#define NEEDBUNDERSCORE 1\n" | |||
"#define NEED2UNDERSCORES 0\n") | |||
else () | |||
set (FCOMMON_OPT "${FCOMMON_OPT} /fp:precise /recursive /names:lowercase /assume:nounderscore") | |||
endif() | |||
else () | |||
#When we only build CBLAS, we set NOFORTRAN=2 | |||
@@ -269,6 +269,31 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||
endif () | |||
endif () | |||
if (${F_COMPILER} STREQUAL "NAGFOR") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_NAG") | |||
if (INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
endif () | |||
# Options from Makefile.system | |||
# -dcfuns: Enable non-standard double precision complex intrinsic functions | |||
# -ieee=full: enables all IEEE arithmetic facilities including non-stop arithmetic. | |||
# -w=obs: Suppress warning messages about obsolescent features | |||
# -thread_safe: Compile code for safe execution in a multi-threaded environment. | |||
# -recursive: Specifies that procedures are RECURSIVE by default. | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -dcfuns -recursive -ieee=full -w=obs -thread_safe") | |||
# Options from Reference-LAPACK | |||
# Suppress compiler banner and summary | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -quiet") | |||
# Disable other common warnings | |||
# -w=x77: Suppress warning messages about Fortran 77 features | |||
# -w=ques: Suppress warning messages about questionable usage | |||
# -w=unused: Suppress warning messages about unused variables | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
endif () | |||
endif () | |||
# from the root Makefile - this is for lapack-netlib to compile the correct secnd file. | |||
if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
set(TIMER "INT_ETIME") | |||
@@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||
SetFallback(CROTKERNEL zrot.S) | |||
SetFallback(ZROTKERNEL zrot.S) | |||
SetFallback(XROTKERNEL zrot.S) | |||
SetFallback(SROTMKERNEL rotm.S) | |||
SetFallback(DROTMKERNEL rotm.S) | |||
SetFallback(QROTMKERNEL rotm.S) | |||
SetFallback(SSCALKERNEL scal.S) | |||
SetFallback(DSCALKERNEL scal.S) | |||
SetFallback(CSCALKERNEL zscal.S) | |||
@@ -1018,7 +1018,12 @@ foreach (LA_FILE ${LA_GEN_SRC}) | |||
endforeach () | |||
if (NOT C_LAPACK) | |||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") | |||
# The below line is duplicating Fortran flags but NAG has a few flags | |||
# that cannot be specified twice. It's possible this is not needed for | |||
# any compiler, but for safety, we only turn off for NAG | |||
if (NOT ${F_COMPILER} STREQUAL "NAGFOR") | |||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") | |||
endif () | |||
if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") | |||
endif() | |||
@@ -58,7 +58,7 @@ set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp") | |||
# c_check | |||
set(FU "") | |||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) | |||
if (APPLE OR (MSVC AND NOT (${CMAKE_C_COMPILER_ID} MATCHES "Clang" OR ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM"))) | |||
set(FU "_") | |||
endif() | |||
if(MINGW AND NOT MINGW64) | |||
@@ -1433,7 +1433,9 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
message(STATUS "MSVC") | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
else() | |||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") | |||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
endif() | |||
if (DEFINED TARGET_CORE) | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE}) | |||
endif () | |||
@@ -631,6 +631,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
endif() | |||
# TODO: not sure what PFLAGS is -hpa | |||
set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") | |||
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
if ("${F_COMPILER}" STREQUAL "FLANG") | |||
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
endif () | |||
endif () | |||
if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") | |||
endif () | |||
endif () | |||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | |||
# TODO: not sure what FPFLAGS is -hpa | |||
@@ -656,7 +668,7 @@ if (CMAKE_Fortran_COMPILER) | |||
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | |||
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
message(STATUS "removing fortran flags") | |||
message(STATUS "removing fortran flags not supported by the compiler") | |||
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | |||
endif () | |||
foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
@@ -687,13 +699,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL | |||
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") | |||
endif () | |||
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
if ("${F_COMPILER}" STREQUAL "FLANG") | |||
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
endif () | |||
endif () | |||
endif () | |||
if (NOT DEFINED SUFFIX) | |||
set(SUFFIX o) | |||
@@ -10,6 +10,10 @@ if (${HOST_OS} STREQUAL "WINDOWS") | |||
set(HOST_OS WINNT) | |||
endif () | |||
if (${HOST_OS} STREQUAL "IOS") | |||
set(HOST_OS DARWIN) | |||
endif () | |||
if (${HOST_OS} STREQUAL "LINUX") | |||
# check if we're building natively on Android (TERMUX) | |||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) | |||
@@ -16,6 +16,14 @@ endfunction () | |||
macro(ParseMakefileVars MAKEFILE_IN) | |||
message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
set (OSNAME ${CMAKE_SYSTEM_NAME}) | |||
if (${C_COMPILER} MATCHES Clang) | |||
set (C_COMPILER CLANG) | |||
endif () | |||
if (${OSNAME} STREQUAL Windows) | |||
set (OSNAME WINNT) | |||
endif () | |||
message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
set (SkipIfs 0) | |||
@@ -372,6 +372,12 @@ typedef int blasint; | |||
#endif | |||
#endif | |||
#if defined(ARCH_RISCV64) | |||
#ifndef YIELDING | |||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
#endif | |||
#endif | |||
#ifdef __EMSCRIPTEN__ | |||
#define YIELDING | |||
@@ -102,9 +102,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
#if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||
#if !defined(__APPLE__) && !defined(_WIN32) | |||
#define OPENBLAS_ARM_TYPE_FUNCTION .type REALNAME, %function ; | |||
#else | |||
#define OPENBLAS_ARM_TYPE_FUNCTION | |||
#endif | |||
#define PROLOGUE \ | |||
.arm ;\ | |||
.global REALNAME ;\ | |||
OPENBLAS_ARM_TYPE_FUNCTION \ | |||
REALNAME: | |||
#define EPILOGUE | |||
@@ -22,6 +22,7 @@ | |||
#define DSUM_K dsum_k | |||
#define DSWAP_K dswap_k | |||
#define DROT_K drot_k | |||
#define DROTM_K drotm_k | |||
#define DGEMV_N dgemv_n | |||
#define DGEMV_T dgemv_t | |||
@@ -180,6 +181,7 @@ | |||
#define DSUM_K gotoblas -> dsum_k | |||
#define DSWAP_K gotoblas -> dswap_k | |||
#define DROT_K gotoblas -> drot_k | |||
#define DROTM_K gotoblas -> drotm_k | |||
#define DGEMV_N gotoblas -> dgemv_n | |||
#define DGEMV_T gotoblas -> dgemv_t | |||
@@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||
int drotmg_k(double *, double *, double *, double *, double *); | |||
int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | |||
int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||
int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
@@ -70,6 +70,7 @@ | |||
#define SUM_K QSUM_K | |||
#define SWAP_K QSWAP_K | |||
#define ROT_K QROT_K | |||
#define ROTM_K QROTM_K | |||
#define GEMV_N QGEMV_N | |||
#define GEMV_T QGEMV_T | |||
@@ -361,6 +362,7 @@ | |||
#define SUM_K DSUM_K | |||
#define SWAP_K DSWAP_K | |||
#define ROT_K DROT_K | |||
#define ROTM_K DROTM_K | |||
#define GEMV_N DGEMV_N | |||
#define GEMV_T DGEMV_T | |||
@@ -977,6 +979,7 @@ | |||
#define SUM_K SSUM_K | |||
#define SWAP_K SSWAP_K | |||
#define ROT_K SROT_K | |||
#define ROTM_K SROTM_K | |||
#define GEMV_N SGEMV_N | |||
#define GEMV_T SGEMV_T | |||
@@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||
double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -197,6 +198,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
#endif | |||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | |||
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -336,6 +338,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
#endif | |||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | |||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
@@ -445,6 +448,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
@@ -22,6 +22,7 @@ | |||
#define QSUM_K qsum_k | |||
#define QSWAP_K qswap_k | |||
#define QROT_K qrot_k | |||
#define QROTM_K qrotm_k | |||
#define QGEMV_N qgemv_n | |||
#define QGEMV_T qgemv_t | |||
@@ -165,6 +166,7 @@ | |||
#define QSUM_K gotoblas -> qsum_k | |||
#define QSWAP_K gotoblas -> qswap_k | |||
#define QROT_K gotoblas -> qrot_k | |||
#define QROTM_K gotoblas -> qrotm_k | |||
#define QGEMV_N gotoblas -> qgemv_n | |||
#define QGEMV_T gotoblas -> qgemv_t | |||
@@ -24,6 +24,7 @@ | |||
#define SSCAL_K sscal_k | |||
#define SSWAP_K sswap_k | |||
#define SROT_K srot_k | |||
#define SROTM_K srotm_k | |||
#define SGEMV_N sgemv_n | |||
#define SGEMV_T sgemv_t | |||
@@ -189,6 +190,7 @@ | |||
#define SSCAL_K gotoblas -> sscal_k | |||
#define SSWAP_K gotoblas -> sswap_k | |||
#define SROT_K gotoblas -> srot_k | |||
#define SROTM_K gotoblas -> srotm_k | |||
#define SGEMV_N gotoblas -> sgemv_n | |||
#define SGEMV_T gotoblas -> sgemv_t | |||
@@ -25,6 +25,7 @@ | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdlib.h> | |||
#include <string.h> | |||
#ifdef __APPLE__ | |||
#include <sys/sysctl.h> | |||
@@ -33,6 +34,23 @@ size_t length=sizeof(value); | |||
int64_t value64; | |||
size_t length64=sizeof(value64); | |||
#endif | |||
#if (defined OS_LINUX || defined OS_ANDROID) | |||
#include <asm/hwcap.h> | |||
#include <sys/auxv.h> | |||
#ifndef HWCAP_CPUID | |||
#define HWCAP_CPUID (1 << 11) | |||
#endif | |||
#ifndef HWCAP_SVE | |||
#define HWCAP_SVE (1 << 22) | |||
#endif | |||
#if (defined OS_WINDOWS) | |||
#include <winreg.h> | |||
#endif | |||
#define get_cpu_ftr(id, var) ({ \ | |||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
}) | |||
#endif | |||
#define CPU_UNKNOWN 0 | |||
#define CPU_ARMV8 1 | |||
@@ -42,11 +60,11 @@ size_t length64=sizeof(value64); | |||
#define CPU_CORTEXA57 3 | |||
#define CPU_CORTEXA72 4 | |||
#define CPU_CORTEXA73 5 | |||
#define CPU_CORTEXA76 23 | |||
#define CPU_CORTEXA76 23 | |||
#define CPU_NEOVERSEN1 11 | |||
#define CPU_NEOVERSEV1 16 | |||
#define CPU_NEOVERSEN2 17 | |||
#define CPU_NEOVERSEV2 24 | |||
#define CPU_NEOVERSEV2 24 | |||
#define CPU_CORTEXX1 18 | |||
#define CPU_CORTEXX2 19 | |||
#define CPU_CORTEXA510 20 | |||
@@ -93,7 +111,7 @@ static char *cpuname[] = { | |||
"CORTEXA710", | |||
"FT2000", | |||
"CORTEXA76", | |||
"NEOVERSEV2" | |||
"NEOVERSEV2" | |||
}; | |||
static char *cpuname_lower[] = { | |||
@@ -121,13 +139,17 @@ static char *cpuname_lower[] = { | |||
"cortexa710", | |||
"ft2000", | |||
"cortexa76", | |||
"neoversev2" | |||
"neoversev2" | |||
}; | |||
static int cpulowperf=0; | |||
static int cpumidperf=0; | |||
static int cpuhiperf=0; | |||
int get_feature(char *search) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -158,33 +180,108 @@ int get_feature(char *search) | |||
#endif | |||
return(0); | |||
} | |||
static int cpusort(const void *model1, const void *model2) | |||
{ | |||
return (*(int*)model2-*(int*)model1); | |||
} | |||
int detect(void) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
int n,i,ii; | |||
int midr_el1; | |||
int implementer; | |||
int cpucap[1024]; | |||
int cpucores[1024]; | |||
FILE *infile; | |||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
char cpupart[6],cpuimpl[6]; | |||
char *cpu_impl=NULL,*cpu_pt=NULL; | |||
char buffer[2048], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
p = (char *) NULL ; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) { | |||
if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||
break; | |||
cpulowperf=cpumidperf=cpuhiperf=0; | |||
for (i=0;i<1024;i++)cpucores[i]=0; | |||
n=0; | |||
infile = fopen("/sys/devices/system/cpu/possible", "r"); | |||
if (!infile) { | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) { | |||
if (!strncmp("processor", buffer, 9)) | |||
n++; | |||
} | |||
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||
cpu_part = strchr(buffer, ':') + 2; | |||
cpu_part = strdup(cpu_part); | |||
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||
cpu_implementer = strchr(buffer, ':') + 2; | |||
cpu_implementer = strdup(cpu_implementer); | |||
} else { | |||
fgets(buffer, sizeof(buffer), infile); | |||
sscanf(buffer,"0-%d",&n); | |||
n++; | |||
} | |||
fclose(infile); | |||
cpu_implementer=NULL; | |||
for (i=0;i<n;i++){ | |||
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i); | |||
infile= fopen(buffer,"r"); | |||
if (!infile) { | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
for (ii=0;ii<n;ii++){ | |||
cpu_part=NULL;cpu_implementer=NULL; | |||
while (fgets(buffer, sizeof(buffer), infile)) { | |||
if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||
break; | |||
} | |||
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||
cpu_pt = strchr(buffer, ':') + 2; | |||
cpu_part = strdup(cpu_pt); | |||
cpucores[i]=strtol(cpu_part,NULL,0); | |||
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||
cpu_impl = strchr(buffer, ':') + 2; | |||
cpu_implementer = strdup(cpu_impl); | |||
} | |||
} | |||
if (strstr(cpu_implementer, "0x41")) { | |||
if (cpucores[ii] >= 0xd4b) cpuhiperf++; | |||
else | |||
if (cpucores[ii] >= 0xd07) cpumidperf++; | |||
else cpulowperf++; | |||
} | |||
else cpulowperf++; | |||
} | |||
fclose(infile); | |||
break; | |||
} else { | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
midr_el1=strtoul(buffer,NULL,16); | |||
fclose(infile); | |||
implementer = (midr_el1 >> 24) & 0xFF; | |||
cpucores[i] = (midr_el1 >> 4) & 0xFFF; | |||
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i); | |||
infile= fopen(buffer,"r"); | |||
if (!infile) { | |||
if (implementer== 65) { | |||
if (cpucores[i] >= 0xd4b) cpuhiperf++; | |||
else | |||
if (cpucores[i] >= 0xd07) cpumidperf++; | |||
else cpulowperf++; | |||
} | |||
else cpulowperf++; | |||
} else { | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
sscanf(buffer,"%d",&cpucap[i]); | |||
if (cpucap[i] >= 1000) cpuhiperf++; | |||
else | |||
if (cpucap[i] >= 500) cpumidperf++; | |||
else cpulowperf++; | |||
fclose(infile); | |||
} | |||
} | |||
sprintf(cpuimpl,"0x%2x",implementer); | |||
cpu_implementer=strdup(cpuimpl); | |||
} | |||
fclose(infile); | |||
qsort(cpucores,1024,sizeof(int),cpusort); | |||
sprintf(cpupart,"0x%3x",cpucores[0]); | |||
cpu_part=strdup(cpupart); | |||
if(cpu_part != NULL && cpu_implementer != NULL) { | |||
// Arm | |||
if (strstr(cpu_implementer, "0x41")) { | |||
@@ -219,7 +316,7 @@ int detect(void) | |||
else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al. | |||
return CPU_NEOVERSEV2; | |||
else if (strstr(cpu_part, "0xd0b")) | |||
return CPU_CORTEXA76; | |||
return CPU_CORTEXA76; | |||
} | |||
// Qualcomm | |||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
@@ -277,11 +374,42 @@ int detect(void) | |||
} | |||
#else | |||
#ifdef __APPLE__ | |||
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | |||
cpulowperf=value64; | |||
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | |||
if (value64 > 1) { | |||
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||
cpuhiperf=value64; | |||
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||
cpulowperf=value64; | |||
} | |||
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | |||
if (value64 == 1867590060) return CPU_VORTEX; //M4 | |||
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | |||
if (value64 == 1867590060) return CPU_VORTEX; //M4 | |||
#else | |||
#ifdef OS_WINDOWS | |||
HKEY reghandle; | |||
HKEY hklm = HKEY_LOCAL_MACHINE; | |||
WCHAR valstring[512]; | |||
PVOID pvalstring=valstring; | |||
DWORD size=sizeof (valstring); | |||
DWORD type=RRF_RT_ANY; | |||
DWORD flags=0; | |||
LPCWSTR subkey= L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"; | |||
LPCWSTR field=L"ProcessorNameString"; | |||
LONG errcode=RegOpenKeyEx(HKEY_LOCAL_MACHINE,TEXT("Hardware\\Description\\System\\CentralProcessor\\0"), 0, KEY_READ, ®handle); | |||
if (errcode != NO_ERROR) wprintf(L"Could not open registry key for proc0: %x\n",errcode); | |||
errcode=RegQueryValueEx(reghandle, "ProcessorNameString", NULL,NULL ,pvalstring,&size); | |||
if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); | |||
//wprintf(stderr,L"%s\n",(PWSTR)valstring); | |||
RegCloseKey(reghandle); | |||
if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; | |||
if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; | |||
if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; | |||
if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; | |||
#endif | |||
#endif | |||
return CPU_ARMV8; | |||
#endif | |||
@@ -314,7 +442,7 @@ void get_cpucount(void) | |||
{ | |||
int n=0; | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -331,10 +459,22 @@ int n=0; | |||
fclose(infile); | |||
printf("#define NUM_CORES %d\n",n); | |||
if (cpulowperf >0) | |||
printf("#define NUM_CORES_LP %d\n",cpulowperf); | |||
if (cpumidperf >0) | |||
printf("#define NUM_CORES_MP %d\n",cpumidperf); | |||
if (cpuhiperf >0) | |||
printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
#endif | |||
#ifdef __APPLE__ | |||
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
printf("#define NUM_CORES %d\n",value); | |||
if (cpulowperf >0) | |||
printf("#define NUM_CORES_LP %d\n",cpulowperf); | |||
if (cpumidperf >0) | |||
printf("#define NUM_CORES_MP %d\n",cpumidperf); | |||
if (cpuhiperf >0) | |||
printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
#endif | |||
} | |||
@@ -347,7 +487,6 @@ void get_cpuconfig(void) | |||
printf("#define ARMV8\n"); | |||
printf("#define HAVE_NEON\n"); // This shouldn't be necessary | |||
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary | |||
int d = detect(); | |||
switch (d) | |||
{ | |||
@@ -402,8 +541,8 @@ void get_cpuconfig(void) | |||
break; | |||
case CPU_NEOVERSEV1: | |||
printf("#define HAVE_SVE 1\n"); | |||
case CPU_CORTEXA76: | |||
printf("#define HAVE_SVE 1\n"); | |||
case CPU_CORTEXA76: | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
@@ -431,32 +570,32 @@ void get_cpuconfig(void) | |||
printf("#define L2_ASSOCIATIVE 8\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
break; | |||
case CPU_NEOVERSEV2: | |||
case CPU_NEOVERSEV2: | |||
printf("#define ARMV9\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
printf("#define L2_SIZE 1048576\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 8\n"); | |||
// L1 Data TLB = 48 entries | |||
// L2 Data TLB = 2048 entries | |||
printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs. | |||
break; | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
printf("#define L2_SIZE 1048576\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 8\n"); | |||
// L1 Data TLB = 48 entries | |||
// L2 Data TLB = 2048 entries | |||
printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs. | |||
break; | |||
case CPU_CORTEXA510: | |||
case CPU_CORTEXA710: | |||
case CPU_CORTEXX1: | |||
case CPU_CORTEXX2: | |||
printf("#define ARMV9\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
@@ -559,8 +698,6 @@ void get_cpuconfig(void) | |||
case CPU_VORTEX: | |||
printf("#define VORTEX \n"); | |||
#ifdef __APPLE__ | |||
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
if (value64 == 1867590060) printf("#define HAVE_SME 1\n");; //M4 | |||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
printf("#define L1_CODE_SIZE %lld \n",value64); | |||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
@@ -575,7 +712,7 @@ void get_cpuconfig(void) | |||
break; | |||
case CPU_A64FX: | |||
printf("#define A64FX\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define HAVE_SVE 1\n"); | |||
printf("#define L1_CODE_SIZE 65535\n"); | |||
printf("#define L1_DATA_SIZE 65535\n"); | |||
printf("#define L1_DATA_LINESIZE 256\n"); | |||
@@ -608,7 +745,7 @@ void get_libname(void) | |||
void get_features(void) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -41,7 +41,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -231,7 +231,7 @@ | |||
CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -515,7 +515,7 @@ | |||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -243,7 +243,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -283,7 +283,7 @@ | |||
SAME = LCE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -291,7 +291,7 @@ | |||
SAME = LCE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -419,7 +419,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -194,7 +194,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -237,7 +237,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -246,7 +246,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -264,7 +264,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -273,7 +273,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -386,7 +386,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -194,7 +194,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -237,7 +237,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -246,7 +246,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -264,7 +264,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -273,7 +273,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -386,7 +386,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -47,7 +47,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -139,7 +139,7 @@ | |||
CALL STEST1(SS,DS1(K),DS1(K),SFAC) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
40 RETURN | |||
@@ -232,7 +232,7 @@ | |||
CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
60 CONTINUE | |||
80 CONTINUE | |||
@@ -387,7 +387,7 @@ | |||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
100 CONTINUE | |||
120 CONTINUE | |||
@@ -475,7 +475,7 @@ | |||
70 CONTINUE | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
40 CONTINUE | |||
60 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -239,7 +239,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -279,7 +279,7 @@ | |||
SAME = LDE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -287,7 +287,7 @@ | |||
SAME = LDE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -415,7 +415,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -189,7 +189,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -232,7 +232,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -241,7 +241,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -259,7 +259,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -268,7 +268,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -380,7 +380,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -47,7 +47,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -139,7 +139,7 @@ | |||
CALL STEST1(SS,DS1(K),DS1(K),SFAC) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
40 RETURN | |||
@@ -232,7 +232,7 @@ | |||
CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
60 CONTINUE | |||
80 CONTINUE | |||
@@ -387,7 +387,7 @@ | |||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
100 CONTINUE | |||
120 CONTINUE | |||
@@ -482,7 +482,7 @@ | |||
70 CONTINUE | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
40 CONTINUE | |||
60 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -239,7 +239,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -279,7 +279,7 @@ | |||
SAME = LSE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -287,7 +287,7 @@ | |||
SAME = LSE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -415,7 +415,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -188,7 +188,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -231,7 +231,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -240,7 +240,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -258,7 +258,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -267,7 +267,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -379,7 +379,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -41,7 +41,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -231,7 +231,7 @@ | |||
CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -515,7 +515,7 @@ | |||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -243,7 +243,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -283,7 +283,7 @@ | |||
SAME = LZE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -291,7 +291,7 @@ | |||
SAME = LZE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -419,7 +419,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -195,7 +195,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -238,7 +238,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -247,7 +247,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -265,7 +265,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -274,7 +274,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -387,7 +387,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -195,7 +195,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -238,7 +238,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -247,7 +247,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -265,7 +265,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -274,7 +274,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -387,7 +387,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,6 +10,15 @@ | |||
#define int long | |||
#endif | |||
#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) | |||
//#define LAPACK_COMPLEX_STRUCTURE | |||
#define NOCHANGE | |||
#endif | |||
/* e.g. mingw64/x86_64-w64-mingw32/include/winerror.h */ | |||
#ifdef FAILED | |||
#undef FAILED | |||
#endif | |||
#define TRUE 1 | |||
#define PASSED 1 | |||
#define TEST_ROW_MJR 1 | |||
@@ -1,104 +1,122 @@ | |||
This page describes the Make-based build, which is the default/authoritative | |||
build method. Note that the OpenBLAS repository also supports building with | |||
CMake (not described here) - that generally works and is tested, however there | |||
may be small differences between the Make and CMake builds. | |||
!!! info "Supported build systems" | |||
This page describes the Make-based build, which is the | |||
default/authoritative build method. Note that the OpenBLAS repository also | |||
supports building with CMake (not described here) - that generally works | |||
and is tested, however there may be small differences between the Make and | |||
CMake builds. | |||
## Makefile dependency graph | |||
<!--- | |||
An easy way to update this diagram is to copy it into https://mermaid.live | |||
and edit it interactively. | |||
--> | |||
```mermaid | |||
flowchart LR | |||
A[Makefile] -->|included by many of the Makefiles in the subdirectories!| B(Makefile.system) | |||
B -->|triggered, not included, once by Makefile.system, and runs before any of the actual library code is built. builds and runs the 'getarch' tool for cpu identification, runs the compiler detection scripts c_check/f_check| C{Makefile.prebuild} | |||
C -->|either this or Makefile_kernel.conf is generated| D[Makefile.conf] | |||
C -->|temporary Makefile.conf during DYNAMIC_ARCH builds| E[Makefile_kernel.conf] | |||
B -->|defaults for build options that can be given on the make command line| F[Makefile.rule] | |||
B -->|architecture-specific compiler options and OpenBLAS buffer size values| G[Makefile.$ARCH] | |||
A --> exports | |||
A -->|directories: test, ctest, utest, cpp_thread_test| H(test directories) | |||
A --> I($BLASDIRS) | |||
I --> interface | |||
I --> driver/level2 | |||
I --> driver/level3 | |||
I --> driver/others | |||
A -->|for each target in DYNAMIC_CORE if DYNAMIC_ARCH=1| kernel | |||
A -->|subdirs: timing, testing, testing/EIG, testing/LIN| J($NETLIB_LAPACK_DIR) | |||
A --> relapack | |||
``` | |||
!!! warning | |||
This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself. | |||
## Makefile dep graph | |||
## Important Variables | |||
``` | |||
Makefile | |||
| | |||
|----- Makefile.system # !!! this is included by many of the Makefiles in the subdirectories !!! | |||
| | | |||
| |===== Makefile.prebuild # This is triggered (not included) once by Makefile.system | |||
| | | # and runs before any of the actual library code is built. | |||
| | | # (builds and runs the "getarch" tool for cpu identification, | |||
| | | # runs the compiler detection scripts c_check and f_check) | |||
| | | | |||
| | ----- (Makefile.conf) [ either this or Makefile_kernel.conf is generated ] | |||
| | | { Makefile.system#L243 } | |||
| | ----- (Makefile_kernel.conf) [ temporary Makefile.conf during DYNAMIC_ARCH builds ] | |||
| | | |||
| |----- Makefile.rule # defaults for build options that can be given on the make command line | |||
| | | |||
| |----- Makefile.$(ARCH) # architecture-specific compiler options and OpenBLAS buffer size values | |||
| | |||
|~~~~~ exports/ | |||
| | |||
|~~~~~ test/ | |||
| | |||
|~~~~~ utest/ | |||
| | |||
|~~~~~ ctest/ | |||
| | |||
|~~~~~ cpp_thread_test/ | |||
| | |||
|~~~~~ kernel/ | |||
| | |||
|~~~~~ ${SUBDIRS} | |||
| | |||
|~~~~~ ${BLASDIRS} | |||
| | |||
|~~~~~ ${NETLIB_LAPACK_DIR}{,/timing,/testing/{EIG,LIN}} | |||
| | |||
|~~~~~ relapack/ | |||
``` | |||
Most of the tunable variables are found in | |||
[Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), | |||
along with their detailed descriptions. | |||
## Important Variables | |||
Most of the variables are detected automatically in | |||
[Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), | |||
if they are not set in the environment. | |||
Most of the tunable variables are found in [Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), along with their detailed descriptions.<br/> | |||
Most of the variables are detected automatically in [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), if they are not set in the environment. | |||
The most commonly used variables are documented below. There are more options | |||
though - please read the linked Makefiles if you want to see all variables. | |||
### CPU related | |||
``` | |||
ARCH - Target architecture (eg. x86_64) | |||
TARGET - Target CPU architecture, in case of DYNAMIC_ARCH=1 means library will not be usable on less capable CPUs | |||
TARGET_CORE - TARGET_CORE will override TARGET internally during each cpu-specific cycle of the build for DYNAMIC_ARCH | |||
DYNAMIC_ARCH - For building library for multiple TARGETs (does not lose any optimizations, but increases library size) | |||
DYNAMIC_LIST - optional user-provided subset of the DYNAMIC_CORE list in Makefile.system | |||
``` | |||
### Toolchain related | |||
``` | |||
CC - TARGET C compiler used for compilation (can be cross-toolchains) | |||
FC - TARGET Fortran compiler used for compilation (can be cross-toolchains, set NOFORTRAN=1 if used cross-toolchain has no fortran compiler) | |||
AR, AS, LD, RANLIB - TARGET toolchain helpers used for compilation (can be cross-toolchains) | |||
- `ARCH`: target architecture (e.g., `x86-64`). | |||
- `DYNAMIC_ARCH`: For building library for multiple `TARGET`s (does not lose any | |||
optimizations, but increases library size). | |||
- `DYNAMIC_LIST`: optional user-provided subset of the `DYNAMIC_CORE` list in | |||
[Makefile.system](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.system). | |||
- `TARGET`: target CPU architecture. In case of `DYNAMIC_ARCH=1`, it means that | |||
the library will not be usable on less capable CPUs. | |||
- `TARGET_CORE`: override `TARGET` internally during each CPU-specific cycle of | |||
the build for `DYNAMIC_ARCH`. | |||
HOSTCC - compiler of build machine, needed to create proper config files for target architecture | |||
HOST_CFLAGS - flags for build machine compiler | |||
``` | |||
### Library related | |||
``` | |||
BINARY - 32/64 bit library | |||
### Toolchain related | |||
BUILD_SHARED - Create shared library | |||
BUILD_STATIC - Create static library | |||
- `CC`: `TARGET` C compiler used for compilation (can be cross-toolchains). | |||
- `FC`: `TARGET` Fortran compiler used for compilation (can be cross-toolchains, | |||
set `NOFORTRAN=1` if the used cross-toolchain has no Fortran compiler). | |||
- `COMMON_OPT`: flags to add to all invocations of the target C and Fortran compilers | |||
(overrides `CFLAGS`/`FFLAGS` - prefer using `COMMON_OPT`) | |||
- `CCOMMON_OPT`: flags to add to all invocations of the target C compiler | |||
(overrides `CFLAGS`) | |||
- `FCOMMON_OPT`: flags to add to all invocations of the target Fortran compiler | |||
(overrides `FFLAGS`) | |||
- `LDFLAGS`: flags to add to all target linker invocations | |||
- `AR`, `AS`, `LD`, `RANLIB`: `TARGET` toolchain helpers used for compilation | |||
(can be cross-toolchains). | |||
- `HOSTCC`: compiler of build machine, needed to create proper config files for | |||
the target architecture. | |||
- `HOST_CFLAGS`: flags for the build machine compiler. | |||
QUAD_PRECISION - enable support for IEEE quad precision [ largely unimplemented leftover from GotoBLAS, do not use ] | |||
EXPRECISION - Obsolete option to use float80 of SSE on BSD-like systems | |||
INTERFACE64 - Build with 64bit integer representations to support large array index values [ incompatible with standard API ] | |||
BUILD_SINGLE - build the single-precision real functions of BLAS [and optionally LAPACK] | |||
BUILD_DOUBLE - build the double-precision real functions | |||
BUILD_COMPLEX - build the single-precision complex functions | |||
BUILD_COMPLEX16 - build the double-precision complex functions | |||
(all four types are included in the build by default when none was specifically selected) | |||
### Library related | |||
BUILD_BFLOAT16 - build the "half precision brainfloat" real functions | |||
#### Library kind and bitness options | |||
- `BINARY`: whether to build a 32-bit or 64-bit library (default is `64`, set | |||
to `32` on a 32-bit platform). | |||
- `INTERFACE64`: build with 64-bit (ILP64) integer representations to support | |||
large array index values (incompatible with the standard 32-bit integer (LP64) API). | |||
- `NO_STATIC`: if set to `1`, don't build a static library (default is `0`) | |||
- `NO_SHARED`: if set to `1`, don't build a shared library (default is `0`) | |||
#### Data type options | |||
- `BUILD_SINGLE`: build the single-precision real functions of BLAS and (if | |||
it's built) LAPACK | |||
- `BUILD_DOUBLE`: build the double-precision real functions | |||
- `BUILD_COMPLEX`: build the single-precision complex functions | |||
- `BUILD_COMPLEX16`: build the double-precision complex functions | |||
- `BUILD_BFLOAT16`: build the "half precision brainfloat" real functions | |||
- `EXPRECISION`: (do not use, this is a work in progress) option to use `long | |||
double` functions | |||
By default, the single- and double-precision real and complex floating-point | |||
functions are included in the build, while the half- and extended-precision | |||
functions are not. | |||
USE_THREAD - Use a multithreading backend (default to pthread) | |||
USE_LOCKING - implement locking for thread safety even when USE_THREAD is not set (so that the singlethreaded library can | |||
safely be called from multithreaded programs) | |||
USE_OPENMP - Use OpenMP as multithreading backend | |||
NUM_THREADS - define this to the maximum number of parallel threads you expect to need (defaults to the number of cores in the build cpu) | |||
NUM_PARALLEL - define this to the number of OpenMP instances that your code may use for parallel calls into OpenBLAS (default 1,see below) | |||
``` | |||
#### Threading options | |||
- `USE_THREAD`: Use a multithreading backend (defaults to `pthreads`). | |||
- `USE_LOCKING`: implement locking for thread safety even when `USE_THREAD` is | |||
not set (so that the single-threaded library can safely be called from | |||
multithreaded programs). | |||
- `USE_OPENMP`: Use OpenMP as multithreading backend | |||
- `NUM_THREADS`: define this to the maximum number of parallel threads you | |||
expect to need (defaults to the number of cores in the build CPU). | |||
- `NUM_PARALLEL`: define this to the number of OpenMP instances that your code | |||
may use for parallel calls into OpenBLAS (the default is `1`, see below). | |||
OpenBLAS uses a fixed set of memory buffers internally, used for communicating | |||
and compiling partial results from individual threads. For efficiency, the | |||
@@ -118,3 +136,32 @@ same time, then only one of them will be able to make progress while all the | |||
rest of them spin-wait for the one available buffer. Setting `NUM_PARALLEL` to | |||
the upper bound on the number of OpenMP runtimes that you can have in a process | |||
ensures that there are a sufficient number of buffer sets available. | |||
#### Library and symbol name options | |||
- `FIXED_LIBNAME`: if set to `1`, uses a non-versioned name for the library and | |||
no symbolic linking to variant names (default is `0`) | |||
- `LIBNAMEPREFIX`: prefix that, if given, will be inserted in the library name | |||
before `openblas` (e.g., `xxx` will result in `libxxxopenblas.so`) | |||
- `LIBNAMESUFFIX`: suffix that, if given, will be inserted in the library name | |||
after `openblas`, separated by an underscore (e.g., `yyy` will result in | |||
`libopenblas_yyy.so`) | |||
- `SYMBOLPREFIX`: prefix that, if given, will be added to all symbol names | |||
*and* to the library name | |||
- `SYMBOLSUFFIX`: suffix that, if given, will be added to all symbol names | |||
*and* to the library name | |||
#### BLAS and LAPACK options | |||
By default, the Fortran and C interfaces to BLAS and LAPACK are built, | |||
including deprecated functions, while | |||
[ReLAPACK](https://github.com/HPAC/ReLAPACK) is not. | |||
- `NO_CBLAS`: if set to `1`, don't build the CBLAS interface (default is `0`) | |||
- `ONLY_CBLAS`: if set to `1`, only build the CBLAS interface (default is `0`) | |||
- `NO_LAPACK`: if set to `1`, don't build LAPACK (default is `0`) | |||
- `NO_LAPACKE`: if set to `1`, don't build the LAPACKE interface (default is `0`) | |||
- `BUILD_LAPACK_DEPRECATED`: if set to `0`, don't build deprecated LAPACK | |||
functions (default is `1`) | |||
- `BUILD_RELAPACK`: if set to `1`, build Recursive LAPACK on top of LAPACK | |||
(default is `0`) |
@@ -5,14 +5,14 @@ This page documents those non-standard APIs. | |||
## BLAS-like extensions | |||
| Routine | Data Types | Description | | |||
| ------------- |:------------- | :---------------| | |||
| ?axpby | s,d,c,z | like axpy with a multiplier for y | | |||
| ?gemm3m | c,z | gemm3m | | |||
| ?imatcopy | s,d,c,z | in-place transpositon/copying | | |||
| ?omatcopy | s,d,c,z | out-of-place transpositon/copying | | |||
| ?geadd | s,d,c,z | matrix add | | |||
| ?gemmt | s,d,c,z | gemm but only a triangular part updated| | |||
| Routine | Data Types | Description | | |||
| ------------- |:------------- | :-----------------------------------------------| | |||
| ?axpby | s,d,c,z | like `axpy` with a multiplier for `y` | | |||
| ?gemm3m | c,z | `gemm3m` | | |||
| ?imatcopy | s,d,c,z | in-place transposition/copying | | |||
| ?omatcopy | s,d,c,z | out-of-place transposition/copying | | |||
| ?geadd | s,d,c,z | ATLAS-like matrix add `B = α*A+β*B` | | |||
| ?gemmt | s,d,c,z | `gemm` but only a triangular part updated | | |||
## bfloat16 functionality | |||
@@ -51,9 +51,9 @@ In practice, the values are derived by experimentation to yield the block sizes | |||
### <a name="reportbug"></a>How can I report a bug? | |||
Please file an issue at this [issue page](https://github.com/xianyi/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users). | |||
Please file an issue at this [issue page](https://github.com/OpenMathLib/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users). | |||
Please provide the following information: CPU, OS, compiler, and OpenBLAS compiling flags (Makefile.rule). In addition, please describe how to reproduce this bug. | |||
Please provide the following information: CPU, OS, compiler, OpenBLAS version and any compiling flags you used (Makefile.rule). In addition, please describe how to reproduce this bug. | |||
### <a name="publication"></a>How to reference OpenBLAS. | |||
@@ -99,13 +99,13 @@ Here is the result of the DGEMM subroutine's performance on Intel Core i5-2500K | |||
### <a name="MSVC"></a>How can I call an OpenBLAS function in Microsoft Visual Studio? | |||
Please read [this page](install.md#visual-studio). | |||
Please read [this page](install.md#visual-studio-native-windows-abi). | |||
### <a name="C99_complex_number"></a>How can I use CBLAS and LAPACKE without C99 complex number support (e.g. in Visual Studio)? | |||
Zaheer has fixed this bug. You can now use the structure instead of C99 complex numbers. Please read [this issue page](http://github.com/xianyi/OpenBLAS/issues/95) for details. | |||
[This issue](https://github.com/xianyi/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio. | |||
[This issue](https://github.com/OpenMathLib/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio. | |||
### <a name="Linux_SEGFAULT"></a>I get a SEGFAULT with multi-threading on Linux. What's wrong? | |||
@@ -134,6 +134,13 @@ Background: OpenBLAS implements optimized versions of some LAPACK functions, so | |||
Some of the LAPACK tests, notably in xeigtstz, try to allocate around 10MB on the stack. You may need to use | |||
`ulimit -s` to change the default limits on your system to allow this. | |||
### <a name="lapack_test"></a>My build worked fine and passed the BLAS tests, but running `make lapack-test` ends with a number of errors in the summary report | |||
The LAPACK tests were primarily created to test the validity of the Reference-LAPACK implementation, which is implemented in unoptimized, single-threaded Fortran code. This makes it very sensitive to small numerical deviations that can result from the use of specialized cpu instructions that combine multiplications and additions without intermediate rounding and storing to memory (FMA), or from changing the order of mathematical operations by splitting an original problem workload into smaller tasks that are solved in parallel. As a result, you may encounter a small number of errors in the "numerical" column of | |||
the summary table at the end of the `make lapack-test` run - this is usually nothing to worry about, and the exact number and distribution of errors among the | |||
four data types will often vary with the optimization flags you supplied to the compiler, or the cpu model for which you built OpenBLAS. Sporadic errors in the column labeled `other` are normally the sign of failed convergence of iterative diagonalizations for the same reasons just mentioned. A more detailed error report is stored in the file testing_results.txt - this should be consulted in case of doubt. Care should be taken if you encounter numerical errors in the hundreds, or `other` errors accompanied by the LAPACK error message "on entry to function_name parameter X had an illegal value" that signals a problem with argument passing between individual functions. | |||
(See also [this issue](https://github.com/OpenMathLib/OpenBLAS/issues/4032) in the issue tracker on github for additional discussion, examples and links) | |||
### <a name="no_affinity"></a>How could I disable OpenBLAS threading affinity on runtime? | |||
You can define the OPENBLAS_MAIN_FREE or GOTOBLAS_MAIN_FREE environment variable to disable threading affinity on runtime. For example, before the running, | |||
@@ -437,49 +437,72 @@ To then use the built OpenBLAS shared library in Visual Studio: | |||
[Qt Creator](http://qt.nokia.com/products/developer-tools/). | |||
#### Windows on Arm | |||
The following tools needs to be installed to build for Windows on Arm (WoA): | |||
- Clang for Windows on Arm. | |||
Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/). | |||
E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe) | |||
Run the LLVM installer and ensure that LLVM is added to environment PATH. | |||
- Download and install classic Flang for Windows on Arm. | |||
Classic Flang is the only available Fortran compiler for Windows on Arm for now. | |||
A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1) | |||
There is no installer for classic flang and the zip package can be | |||
extracted and the path needs to be added to environment `PATH`. | |||
E.g., in PowerShell: | |||
``` | |||
$env:Path += ";C:\flang_woa\bin" | |||
``` | |||
The following steps describe how to build the static library for OpenBLAS with and without LAPACK: | |||
1. Build OpenBLAS static library with BLAS and LAPACK routines with Make: | |||
```bash | |||
$ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib | |||
``` | |||
2. Build static library with BLAS routines using CMake: | |||
Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake: | |||
```bash | |||
$ mkdir build | |||
$ cd build | |||
$ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows | |||
$ cmake --build . --config Release | |||
``` | |||
!!! tip "`getarch.exe` execution error" | |||
If you notice that platform-specific headers by `getarch.exe` are not | |||
generated correctly, this could be due to a known debug runtime DLL issue for | |||
arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround) | |||
for a workaround. | |||
### Windows on Arm | |||
A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool). | |||
(Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using | |||
the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) | |||
1. Clone OpenBLAS to your local machine and checkout to latest release of | |||
OpenBLAS (unless you want to build the latest development snapshot - here we | |||
are using the 0.3.28 release as the example, of course this exact version | |||
may be outdated by the time you read this) | |||
```cmd | |||
git clone https://github.com/OpenMathLib/OpenBLAS.git | |||
cd OpenBLAS | |||
git checkout v0.3.28 | |||
``` | |||
2. Install Latest LLVM toolchain for WoA: | |||
Download the Latest LLVM toolchain for WoA from [the Release | |||
page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At | |||
the time of writing, this is version 19.1.5 - be sure to select the | |||
latest release for which you can find a precompiled package whose name ends | |||
in "-woa64.exe" (precompiled packages usually lag a week or two behind their | |||
corresponding source release). Make sure to enable the option | |||
*“Add LLVM to the system PATH for all the users”*. | |||
Note: Make sure that the path of LLVM toolchain is at the top of Environment | |||
Variables section to avoid conflicts between the set of compilers available | |||
in the system path | |||
3. Launch the Native Command Prompt for Windows ARM64: | |||
From the start menu search for *"ARM64 Native Tools Command Prompt for Visual | |||
Studio 2022"*. Alternatively open command prompt, run the following command to | |||
activate the environment: | |||
```cmd | |||
C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat | |||
``` | |||
4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | |||
by invoking Ninja: | |||
```cmd | |||
cd OpenBLAS | |||
mkdir build | |||
cd build | |||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new | |||
ninja -j16 | |||
``` | |||
Note: You might want to include additional options in the cmake command | |||
here. For example, the default configuration only generates a | |||
`static.lib` version of the library. If you prefer a DLL, you can add | |||
`-DBUILD_SHARED_LIBS=ON`. | |||
Note that it is also possible to use the same setup to build OpenBLAS | |||
with Make, if you prefer Makefiles over the CMake build for some | |||
reason: | |||
```cmd | |||
$ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make | |||
``` | |||
#### Generating an import library | |||
@@ -501,7 +524,7 @@ In your shell, move to this directory: `cd exports`. | |||
incompatibility in the C ABI would be a bug). | |||
The import libraries of MSVC have the suffix `.lib`. They are generated | |||
from a `.def` file using MSVC's `lib.exe`. See [the MSVC instructions](use_visual_studio.md#generate-import-library-before-0210-version). | |||
from a `.def` file using MSVC's `lib.exe`. | |||
=== "MinGW" | |||
@@ -532,7 +555,6 @@ In your shell, move to this directory: `cd exports`. | |||
To build OpenBLAS for Android, you will need the following tools installed on your machine: | |||
- [The Android NDK](https://developer.android.com/ndk/) | |||
- Perl | |||
- Clang compiler on the build machine | |||
The next two sections below describe how to build with Clang for ARMV7 and | |||
@@ -574,7 +596,9 @@ utility in the make command above, like so: | |||
AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar | |||
``` | |||
otherwise you may get a linker error complaining like `malformed archive header | |||
name at 8` when the native macOS `ar` command was invoked instead. | |||
name at 8` when the native macOS `ar` command was invoked instead. Note that | |||
with recent NDK versions, the AR tool may be named `llvm-ar` rather than what | |||
is assumed above. | |||
#### Building for ARMV8 | |||
@@ -604,12 +628,17 @@ Note: for NDK 23b, something as simple as: | |||
export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH | |||
make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 | |||
``` | |||
appears to be sufficient on Linux. | |||
appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the | |||
"bin" path of the NDK (probably `llvm-ar`) is also necessary. | |||
??? note "Alternative build script for 3 architectures" | |||
This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`. | |||
This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, | |||
`X86`) and install them to `/opt/OpenBLAS/lib`. Of course you can also copy | |||
only the section that is of interest to you - also notice that the `AR=` | |||
line may need adapting to the name of the ar tool provided in your | |||
`$TOOLCHAIN/bin` - for example `llvm-ar` in some recent NDK versions. | |||
It was tested on macOS with NDK version 21.3.6528147. | |||
```bash | |||
@@ -680,6 +709,40 @@ make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
Adjust `MIN_IOS_VERSION` as necessary for your installation. E.g., change the version number | |||
to the minimum iOS version you want to target and execute this file to build the library. | |||
### HarmonyOS | |||
For this target you will need the cross-compiler toolchain package by Huawei, | |||
which contains solutions for both Windows and Linux. Only the Linux-based | |||
toolchain has been tested so far, but the following instructions may apply | |||
similarly to Windows: | |||
Download [this HarmonyOS 4.1.1 SDK](https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz), | |||
or whatever newer version may be available in the future). Use `tar -xvf | |||
ohos-sdk-windows_linux_public.tar.gz` to unpack it somewhere on your system. | |||
This will create a folder named "ohos-sdk" with subfolders "linux" and | |||
"windows". In the linux one you will find a ZIP archive named | |||
`native-linux-x64-4.1.7.8-Release.zip` - you need to unzip this where you want | |||
to install the cross-compiler, for example in `/opt/ohos-sdk`. | |||
In the directory where you unpacked OpenBLAS, create a build directory for cmake, and change into it : | |||
```bash | |||
mkdir build | |||
cd build | |||
``` | |||
Use the version of `cmake` that came with the SDK, and specify the location of | |||
its toolchain file as a cmake option. Also set the build target for OpenBLAS to | |||
`ARMV8` and specify `NOFORTRAN=1` (at least as of version 4.1.1, the SDK | |||
contains no Fortran compiler): | |||
```bash | |||
/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake \ | |||
-DCMAKE_TOOLCHAIN_FILE=/opt/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ | |||
-DOHOS_ARCH="arm64-v8a" -DTARGET=ARMV8 -DNOFORTRAN=1 .. | |||
``` | |||
Additional other OpenBLAS build options like `USE_OPENMP=1` or `DYNAMIC_ARCH=1` | |||
will probably work too. Finally do the build: | |||
```bash | |||
/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake --build . | |||
``` | |||
### MIPS | |||
@@ -0,0 +1,38 @@ | |||
OpenBLAS checks the following environment variables on startup: | |||
* `OPENBLAS_NUM_THREADS`: the number of threads to use (for non-OpenMP builds | |||
of OpenBLAS) | |||
* `OMP_NUM_THREADS`: the number of threads to use (for OpenMP builds - note | |||
that setting this may also affect any other OpenMP code) | |||
* `OPENBLAS_DEFAULT_NUM_THREADS`: the number of threads to use, irrespective if | |||
OpenBLAS was built for OpenMP or pthreads | |||
* `OPENBLAS_MAIN_FREE=1`: this can be used to disable automatic assignment of | |||
cpu affinity in OpenBLAS builds that have it enabled by default | |||
* `OPENBLAS_THREAD_TIMEOUT`: this can be used to define the length of time | |||
that idle threads should wait before exiting | |||
* `OMP_ADAPTIVE=1`: this can be used in OpenMP builds to actually remove any | |||
surplus threads when the number of threads is decreased | |||
`DYNAMIC_ARCH` builds also accept the following: | |||
* `OPENBLAS_VERBOSE`: | |||
- set this to `1` to enable a warning when there is no exact match for the | |||
detected cpu in the library | |||
- set this to `2` to make OpenBLAS print the name of the cpu target it | |||
autodetected | |||
* `OPENBLAS_CORETYPE`: set this to one of the supported target names to | |||
override autodetection, e.g., `OPENBLAS_CORETYPE=HASWELL` | |||
* `OPENBLAS_L2_SIZE`: set this to override the autodetected size of the L2 | |||
cache where it is not reported correctly (in virtual environments) | |||
Deprecated variables still recognized for compatibilty: | |||
* `GOTO_NUM_THREADS`: equivalent to `OPENBLAS_NUM_THREADS` | |||
* `GOTOBLAS_MAIN_FREE`: equivalent to `OPENBLAS_MAIN_FREE` | |||
* `OPENBLAS_BLOCK_FACTOR`: this applies a scale factor to the GEMM "P" | |||
parameter of the block matrix code, see file `driver/others/parameter.c` |
@@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
#ifdef USE_OPENMP | |||
static omp_lock_t level3_lock, critical_section_lock; | |||
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, | |||
static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, | |||
parallel_section_left = MAX_PARALLEL_NUMBER; | |||
// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | |||
@@ -286,22 +286,59 @@ static gotoblas_t *get_coretype(void) { | |||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { | |||
#ifdef __linux | |||
int i; | |||
int ncores=0; | |||
int prt,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0; | |||
FILE *infile; | |||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
p = (char *) NULL ; | |||
infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); | |||
if (!infile) return NULL; | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
midr_el1=strtoul(buffer,NULL,16); | |||
fclose(infile); | |||
#else | |||
char buffer[512], *cpu_part = NULL, *cpu_implementer = NULL; | |||
infile = fopen("/sys/devices/system/cpu/possible","r"); | |||
if (infile) { | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
sscanf(buffer,"0-%d",&ncores); | |||
fclose (infile); | |||
ncores++; | |||
} else { | |||
infile = fopen("/proc/cpuinfo","r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) { | |||
if (!strncmp("processor", buffer, 9)) | |||
ncores++; | |||
} | |||
} | |||
for (i=0;i<ncores;i++) { | |||
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i); | |||
infile = fopen(buffer,"r"); | |||
if (!infile) return NULL; | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
midr_el1=strtoul(buffer,NULL,16); | |||
implementer = (midr_el1 >> 24) & 0xFF; | |||
prt = (midr_el1 >> 4) & 0xFFF; | |||
fclose(infile); | |||
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capability",i); | |||
infile = fopen(buffer,"r"); | |||
if (infile) { | |||
(void)fgets(buffer, sizeof(buffer), infile); | |||
cpucap=strtoul(buffer,NULL,16); | |||
fclose(infile); | |||
if (cpucap >= 1000) cpuhiperf++; | |||
else if (cpucap >=500) cpumidperf++; | |||
else cpulowperf++; | |||
if (cpucap >=1000) part = prt; | |||
} else if (implementer == 0x41 ){ | |||
if (prt >= 0xd4b) cpuhiperf++; | |||
else if (prt>= 0xd07) cpumidperf++; | |||
else cpulowperf++; | |||
} else cpulowperf++; | |||
} | |||
if (!part) part = prt; | |||
#else | |||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n"); | |||
openblas_warning(1, coremsg); | |||
return NULL; | |||
#endif | |||
} else { | |||
get_cpu_ftr(MIDR_EL1, midr_el1); | |||
} | |||
/* | |||
* MIDR_EL1 | |||
* | |||
@@ -312,7 +349,7 @@ static gotoblas_t *get_coretype(void) { | |||
*/ | |||
implementer = (midr_el1 >> 24) & 0xFF; | |||
part = (midr_el1 >> 4) & 0xFFF; | |||
} | |||
switch(implementer) | |||
{ | |||
case 0x41: // ARM | |||
@@ -2538,7 +2538,7 @@ static void *alloc_shm(void *address){ | |||
} | |||
#endif | |||
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS | |||
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) | |||
static void alloc_hugetlb_free(struct release_t *release){ | |||
@@ -3254,7 +3254,7 @@ void blas_shutdown(void){ | |||
#endif | |||
newmemory[pos].lock = 0; | |||
} | |||
free(newmemory); | |||
free((void*)newmemory); | |||
newmemory = NULL; | |||
memory_overflowed = 0; | |||
} | |||
@@ -869,8 +869,12 @@ lapackobjs2z="$lapackobjs2z | |||
#functions added post 3.11 | |||
lapackobjs2c="$lapackobjs2c | |||
cgelst | |||
cgeqp3rk | |||
claqp2rk | |||
claqp3rk | |||
clatrs3 | |||
crscl | |||
ctrsyl3 | |||
" | |||
# claqz0 | |||
@@ -894,6 +898,16 @@ lapackobjs2d="$lapackobjs2d | |||
# dlaqz3 | |||
# dlaqz4 | |||
lapackobjs2s="$lapackobjs2s | |||
sgelst | |||
sgeqp3rk | |||
slaqp2rk | |||
slaqp3rk | |||
slarmm | |||
slatrs3 | |||
strsyl3 | |||
" | |||
lapackobjs2z="$lapackobjs2z | |||
zgelst | |||
zgeqp3rk | |||
@@ -245,6 +245,13 @@ else | |||
;; | |||
*flang*) | |||
vendor=FLANG | |||
data=`$compiler -v 2>&1 > /dev/null` | |||
v="${data#*version *}" | |||
v="${v%%*.}" | |||
major="${v%%.*}" | |||
if [ "$major" -ge 17 ]; then | |||
vendor=FLANGNEW | |||
fi | |||
bu=_ | |||
openmp='-fopenmp' | |||
;; | |||
@@ -109,7 +109,7 @@ endif () | |||
GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) | |||
# gemmtr is gemmt under the name adopted by the Reference BLAS | |||
GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG}) | |||
GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG}) | |||
# max and imax are compiled 4 times | |||
GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) | |||
@@ -126,7 +126,7 @@ if (BUILD_BFLOAT16) | |||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
@@ -1304,9 +1304,9 @@ ifeq ($(BUILD_BFLOAT16),1) | |||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
endif | |||
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
@@ -1328,34 +1328,34 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
@@ -38,6 +38,17 @@ | |||
#ifndef COMPLEX | |||
#define SMP_THRESHOLD_MIN 65536.0 | |||
#ifdef RNAME | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "QGEMMTR" | |||
#elif defined(DOUBLE) | |||
#define ERROR_NAME "DGEMMTR" | |||
#elif defined(BFLOAT16) | |||
#define ERROR_NAME "SBGEMMTR" | |||
#else | |||
#define ERROR_NAME "SGEMMTR" | |||
#endif | |||
#else | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "QGEMMT " | |||
#elif defined(DOUBLE) | |||
@@ -47,8 +58,18 @@ | |||
#else | |||
#define ERROR_NAME "SGEMMT " | |||
#endif | |||
#endif | |||
#else | |||
#define SMP_THRESHOLD_MIN 8192.0 | |||
#ifdef RNAME | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "XGEMMTR" | |||
#elif defined(DOUBLE) | |||
#define ERROR_NAME "ZGEMMTR" | |||
#else | |||
#define ERROR_NAME "CGEMMTR" | |||
#endif | |||
#else | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "XGEMMT " | |||
#elif defined(DOUBLE) | |||
@@ -57,6 +78,7 @@ | |||
#define ERROR_NAME "CGEMMT " | |||
#endif | |||
#endif | |||
#endif | |||
#ifndef GEMM_MULTITHREAD_THRESHOLD | |||
#define GEMM_MULTITHREAD_THRESHOLD 4 | |||
@@ -63,6 +63,36 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||
}; | |||
#endif | |||
#ifdef DYNAMIC_ARCH | |||
extern char* gotoblas_corename(void); | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||
return | |||
MN < 25600L ? 1 | |||
: MN < 63001L ? MIN(ncpu, 4) | |||
: MN < 459684L ? MIN(ncpu, 16) | |||
: ncpu; | |||
} | |||
#endif | |||
static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||
int ncpu = num_cpu_avail(3); | |||
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
} | |||
#endif | |||
if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
return 1; | |||
else | |||
return num_cpu_avail(2); | |||
} | |||
#ifndef CBLAS | |||
void NAME(char *TRANS, blasint *M, blasint *N, | |||
@@ -225,11 +255,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
#ifdef SMP | |||
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(2); | |||
nthreads = get_gemv_optimal_nthreads(1L * m * n); | |||
if (nthreads == 1) { | |||
#endif | |||
@@ -107,21 +107,33 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, | |||
#ifndef PPC440 | |||
buffer = (FLOAT *)blas_memory_alloc(1); | |||
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
#endif | |||
#ifdef SMP | |||
args.common = NULL; | |||
#ifndef DOUBLE | |||
if (args.m*args.n < 40000) | |||
#if defined(_WIN64) && defined(_M_ARM64) | |||
#ifdef COMPLEX | |||
if (args.m * args.n > 600) | |||
#else | |||
if (args.m * args.n > 1000) | |||
#endif | |||
args.nthreads = num_cpu_avail(4); | |||
else | |||
args.nthreads = 1; | |||
#else | |||
if (args.m*args.n < 10000) | |||
#ifndef DOUBLE | |||
if (args.m * args.n < 40000) | |||
#else | |||
if (args.m * args.n < 10000) | |||
#endif | |||
args.nthreads = 1; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
#endif | |||
args.nthreads=1; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
if (args.nthreads == 1) { | |||
#endif | |||
@@ -127,6 +127,9 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In | |||
#endif | |||
#ifdef SMP | |||
if (args.n <= 150) | |||
args.nthreads = 1; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
if (args.nthreads == 1) { | |||
@@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx == 0) | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
return (sqrt((double)n)*fabs(x[0])); | |||
#else | |||
return (sqrt((float)n)*fabsf(x[0])); | |||
#endif | |||
#else | |||
#ifdef DOUBLE | |||
{ | |||
double fr=fabs(x[0]); | |||
double fi=fabs(x[1]); | |||
double fmin=MIN(fr,fi); | |||
double fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#else | |||
{ | |||
float fr=fabs(x[0]); | |||
float fi=fabs(x[1]); | |||
float fmin=MIN(fr,fi); | |||
float fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
@@ -97,13 +128,44 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
if (n <= 0) return 0.; | |||
#ifndef COMPLEX | |||
#ifndef COMPLEX | |||
if (n == 1) | |||
#ifdef DOUBLE | |||
return fabs(x[0]); | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx == 0) | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
return (sqrt((double)n)*fabs(x[0])); | |||
#else | |||
return (sqrt((float)n)*fabsf(x[0])); | |||
#endif | |||
#else | |||
#ifdef DOUBLE | |||
{ | |||
double fr=fabs(x[0]); | |||
double fi=fabs(x[1]); | |||
double fmin=MIN(fr,fi); | |||
double fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#else | |||
{ | |||
float fr=fabs(x[0]); | |||
float fi=fabs(x[1]); | |||
float fmin=MIN(fr,fi); | |||
float fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
@@ -7,149 +7,21 @@ | |||
void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
PRINT_DEBUG_NAME | |||
#else | |||
void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | |||
#endif | |||
blasint i__1, i__2; | |||
PRINT_DEBUG_CNAME; | |||
blasint i__; | |||
FLOAT w, z__; | |||
blasint kx, ky; | |||
FLOAT dh11, dh12, dh22, dh21, dflag; | |||
blasint nsteps; | |||
#ifndef CBLAS | |||
PRINT_DEBUG_CNAME; | |||
#else | |||
PRINT_DEBUG_CNAME; | |||
#endif | |||
--dparam; | |||
--dy; | |||
--dx; | |||
dflag = dparam[1]; | |||
if (n <= 0 || dflag == - 2.0) goto L140; | |||
if (! (incx == incy && incx > 0)) goto L70; | |||
nsteps = n * incx; | |||
if (dflag < 0.) { | |||
goto L50; | |||
} else if (dflag == 0) { | |||
goto L10; | |||
} else { | |||
goto L30; | |||
} | |||
L10: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w + z__ * dh12; | |||
dy[i__] = w * dh21 + z__; | |||
/* L20: */ | |||
} | |||
goto L140; | |||
L30: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = nsteps; | |||
i__1 = incx; | |||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__; | |||
dy[i__] = -w + dh22 * z__; | |||
/* L40: */ | |||
} | |||
goto L140; | |||
L50: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__ * dh12; | |||
dy[i__] = w * dh21 + z__ * dh22; | |||
/* L60: */ | |||
} | |||
goto L140; | |||
L70: | |||
kx = 1; | |||
ky = 1; | |||
if (incx < 0) { | |||
kx = (1 - n) * incx + 1; | |||
} | |||
if (incy < 0) { | |||
ky = (1 - n) * incy + 1; | |||
} | |||
ROTM_K(n, dx, incx, dy, incy, dparam); | |||
if (dflag < 0.) { | |||
goto L120; | |||
} else if (dflag == 0) { | |||
goto L80; | |||
} else { | |||
goto L100; | |||
} | |||
L80: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w + z__ * dh12; | |||
dy[ky] = w * dh21 + z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L90: */ | |||
} | |||
goto L140; | |||
L100: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__; | |||
dy[ky] = -w + dh22 * z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L110: */ | |||
} | |||
goto L140; | |||
L120: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__ * dh12; | |||
dy[ky] = w * dh21 + z__ * dh22; | |||
kx += incx; | |||
ky += incy; | |||
/* L130: */ | |||
} | |||
L140: | |||
return; | |||
} | |||
@@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||
XROTKERNEL = zrot.S | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = rotm.S | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = rotm.S | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = rotm.S | |||
endif | |||
### SCAL ### | |||
ifndef SSCALKERNEL | |||
@@ -504,21 +516,21 @@ SBLASOBJS += \ | |||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
saxpby_k$(TSUFFIX).$(SUFFIX) | |||
saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||
DBLASOBJS += \ | |||
damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | |||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||
QBLASOBJS += \ | |||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
qsum_k$(TSUFFIX).$(SUFFIX) | |||
qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||
CBLASOBJS += \ | |||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
@@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | |||
@@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVNKERNEL = gemv_n_sve.c | |||
DGEMVNKERNEL = gemv_n_sve.c | |||
SGEMVTKERNEL = gemv_t_sve.c | |||
DGEMVTKERNEL = gemv_t_sve.c | |||
SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v4x3.c |
@@ -64,8 +64,8 @@ DAXPYKERNEL = daxpy_thunderx2t99.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
SROTKERNEL = rot.c | |||
DROTKERNEL = rot.c | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
@@ -94,8 +94,8 @@ DCOPYKERNEL = copy_thunderx2t99.c | |||
CCOPYKERNEL = copy_thunderx2t99.c | |||
ZCOPYKERNEL = copy_thunderx2t99.c | |||
SSWAPKERNEL = swap_thunderx2t99.S | |||
DSWAPKERNEL = swap_thunderx2t99.S | |||
SSWAPKERNEL = swap.c | |||
DSWAPKERNEL = swap.c | |||
CSWAPKERNEL = swap_thunderx2t99.S | |||
ZSWAPKERNEL = swap_thunderx2t99.S | |||
@@ -104,10 +104,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
@@ -98,8 +98,18 @@ ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
ifeq ($(OSNAME), WINNT) | |||
ifeq ($(C_COMPILER), CLANG) | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
else | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
endif | |||
else | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
endif | |||
DSDOTKERNEL = dot.S | |||
DGEMM_BETA = dgemm_beta.S | |||
@@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
@@ -1,4 +1,18 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVTKERNEL = gemv_t_sve.c | |||
DGEMVTKERNEL = gemv_t_sve.c | |||
SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
ifeq ($(BUILD_BFLOAT16), 1) | |||
SBGEMM_BETA = sbgemm_beta_neoversev1.c | |||
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c | |||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif |
@@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -1,216 +1,217 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2017, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define N "x0" /* vector length */ | |||
#define X "x1" /* X vector address */ | |||
#define INC_X "x2" /* X stride */ | |||
#define Y "x3" /* Y vector address */ | |||
#define INC_Y "x4" /* Y stride */ | |||
#define J "x5" /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
#if !defined(COMPLEX) | |||
#if !defined(DOUBLE) | |||
#define TMPF "s0" | |||
#define INC_SHIFT "2" | |||
#define N_DIV_SHIFT "2" | |||
#define N_REM_MASK "3" | |||
#else | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#endif | |||
#else | |||
#if !defined(DOUBLE) | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#else | |||
#define TMPF "q0" | |||
#define INC_SHIFT "4" | |||
#define N_DIV_SHIFT "0" | |||
#define N_REM_MASK "0" | |||
#endif | |||
#endif | |||
#define KERNEL_F1 \ | |||
"ldr "TMPF", ["X"] \n" \ | |||
"add "X", "X", "INC_X" \n" \ | |||
"str "TMPF", ["Y"] \n" \ | |||
"add "Y", "Y", "INC_Y" \n" | |||
#define KERNEL_F \ | |||
"ldr q0, ["X"], #16 \n" \ | |||
"str q0, ["Y"], #16 \n" | |||
#define INIT \ | |||
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
if ( n < 0 ) return 0; | |||
__asm__ __volatile__ ( | |||
" mov "N", %[N_] \n" | |||
" mov "X", %[X_] \n" | |||
" mov "INC_X", %[INCX_] \n" | |||
" mov "Y", %[Y_] \n" | |||
" mov "INC_Y", %[INCY_] \n" | |||
" cmp "N", xzr \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
"// .Lcopy_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq 2f //copy_kernel_F1 \n" | |||
" .align 5 \n" | |||
"1: //copy_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 1b //copy_kernel_F \n" | |||
"2: //copy_kernel_F1: \n" | |||
#if defined(COMPLEX) && defined(DOUBLE) | |||
" b 8f //copy_kernel_L999 \n" | |||
#else | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
#endif | |||
"3: //copy_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 3b //copy_kernel_F10 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
"4: //copy_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble 6f //copy_kernel_S1 \n" | |||
"5: //copy_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 5b //copy_kernel_S4 \n" | |||
"6: //copy_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
"7: //copy_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 7b //copy_kernel_S10 \n" | |||
"8: //copy_kernel_L999: \n" | |||
: | |||
: [N_] "r" (n), //%1 | |||
[X_] "r" (x), //%2 | |||
[INCX_] "r" (inc_x), //%3 | |||
[Y_] "r" (y), //%4 | |||
[INCY_] "r" (inc_y) //%5 | |||
: "cc", | |||
"memory", | |||
"x0", "x1", "x2", "x3", "x4", "x5", | |||
"d0" | |||
); | |||
return 0; | |||
} | |||
#if defined(SMP) | |||
static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||
{ | |||
do_copy(n, x, inc_x, y, inc_y); | |||
return 0; | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
#if defined(SMP) | |||
int nthreads; | |||
FLOAT dummy_alpha; | |||
#endif | |||
if (n <= 0) return 0; | |||
#if defined(SMP) | |||
if (inc_x == 0 || n <= 10000) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(1); | |||
if (nthreads == 1) { | |||
do_copy(n, x, inc_x, y, inc_y); | |||
} else { | |||
int mode = 0; | |||
#if !defined(COMPLEX) | |||
mode = BLAS_REAL; | |||
#else | |||
mode = BLAS_COMPLEX; | |||
#endif | |||
#if !defined(DOUBLE) | |||
mode |= BLAS_SINGLE; | |||
#else | |||
mode |= BLAS_DOUBLE; | |||
#endif | |||
blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||
x, inc_x, y, inc_y, NULL, 0, | |||
( void *)copy_thread_function, nthreads); | |||
} | |||
#else | |||
do_copy(n, x, inc_x, y, inc_y); | |||
#endif | |||
return 0; | |||
} | |||
/*************************************************************************** | |||
Copyright (c) 2017, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define N "x0" /* vector length */ | |||
#define X "x1" /* X vector address */ | |||
#define INC_X "x2" /* X stride */ | |||
#define Y "x3" /* Y vector address */ | |||
#define INC_Y "x4" /* Y stride */ | |||
#define J "x5" /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
#if !defined(COMPLEX) | |||
#if !defined(DOUBLE) | |||
#define TMPF "s0" | |||
#define INC_SHIFT "2" | |||
#define N_DIV_SHIFT "2" | |||
#define N_REM_MASK "3" | |||
#else | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#endif | |||
#else | |||
#if !defined(DOUBLE) | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#else | |||
#define TMPF "q0" | |||
#define INC_SHIFT "4" | |||
#define N_DIV_SHIFT "0" | |||
#define N_REM_MASK "0" | |||
#endif | |||
#endif | |||
#define KERNEL_F1 \ | |||
"ldr "TMPF", ["X"] \n" \ | |||
"add "X", "X", "INC_X" \n" \ | |||
"str "TMPF", ["Y"] \n" \ | |||
"add "Y", "Y", "INC_Y" \n" | |||
#define KERNEL_F \ | |||
"ldr q0, ["X"], #16 \n" \ | |||
"str q0, ["Y"], #16 \n" | |||
#define INIT \ | |||
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
if ( n < 0 ) return 0; | |||
__asm__ __volatile__ ( | |||
" mov "N", %[N_] \n" | |||
" mov "X", %[X_] \n" | |||
" mov "INC_X", %[INCX_] \n" | |||
" mov "Y", %[Y_] \n" | |||
" mov "INC_Y", %[INCY_] \n" | |||
" cmp "N", xzr \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
"// .Lcopy_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq 2f //copy_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
" .align 5 \n" | |||
#endif | |||
"1: //copy_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 1b //copy_kernel_F \n" | |||
"2: //copy_kernel_F1: \n" | |||
#if defined(COMPLEX) && defined(DOUBLE) | |||
" b 8f //copy_kernel_L999 \n" | |||
#else | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
#endif | |||
"3: //copy_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 3b //copy_kernel_F10 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
"4: //copy_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble 6f //copy_kernel_S1 \n" | |||
"5: //copy_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 5b //copy_kernel_S4 \n" | |||
"6: //copy_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
"7: //copy_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 7b //copy_kernel_S10 \n" | |||
"8: //copy_kernel_L999: \n" | |||
: | |||
: [N_] "r" (n), //%1 | |||
[X_] "r" (x), //%2 | |||
[INCX_] "r" (inc_x), //%3 | |||
[Y_] "r" (y), //%4 | |||
[INCY_] "r" (inc_y) //%5 | |||
: "cc", | |||
"memory", | |||
"x0", "x1", "x2", "x3", "x4", "x5", | |||
"d0" | |||
); | |||
return 0; | |||
} | |||
#if defined(SMP) | |||
static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||
{ | |||
do_copy(n, x, inc_x, y, inc_y); | |||
return 0; | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
#if defined(SMP) | |||
int nthreads; | |||
FLOAT dummy_alpha; | |||
#endif | |||
if (n <= 0) return 0; | |||
#if defined(SMP) | |||
if (inc_x == 0 || n <= 10000) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(1); | |||
if (nthreads == 1) { | |||
do_copy(n, x, inc_x, y, inc_y); | |||
} else { | |||
int mode = 0; | |||
#if !defined(COMPLEX) | |||
mode = BLAS_REAL; | |||
#else | |||
mode = BLAS_COMPLEX; | |||
#endif | |||
#if !defined(DOUBLE) | |||
mode |= BLAS_SINGLE; | |||
#else | |||
mode |= BLAS_DOUBLE; | |||
#endif | |||
blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||
x, inc_x, y, inc_y, NULL, 0, | |||
( void *)copy_thread_function, nthreads); | |||
} | |||
#else | |||
do_copy(n, x, inc_x, y, inc_y); | |||
#endif | |||
return 0; | |||
} |
@@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" cmp "J", xzr \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
".align 5 \n" | |||
#endif | |||
"2: //asum_kernel_F32: \n" | |||
" "KERNEL_F32" \n" | |||
" subs "J", "J", #1 \n" | |||
@@ -213,7 +213,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n2 = N & -2; | |||
const BLASLONG n8 = N & -8; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -219,7 +219,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n4 = N & -4; | |||
const BLASLONG n2 = N & -2; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT | |||
" asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | |||
" cmp %[J_], xzr \n" | |||
" beq 3f //dot_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
" .align 5 \n" | |||
#endif | |||
"2: //dot_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs %[J_], %[J_], #1 \n" | |||
@@ -0,0 +1,168 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, 2025 The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT *y_ptr; | |||
FLOAT temp; | |||
iy = 0; | |||
if (inc_x == 1) { | |||
BLASLONG width = n / 3; | |||
BLASLONG sve_size = SV_COUNT(); | |||
svbool_t pg_true = SV_TRUE(); | |||
svbool_t pg = SV_WHILE(0, m % sve_size); | |||
FLOAT *a0_ptr = a + lda * width * 0; | |||
FLOAT *a1_ptr = a + lda * width * 1; | |||
FLOAT *a2_ptr = a + lda * width * 2; | |||
FLOAT *y0_ptr = y + inc_y * width * 0; | |||
FLOAT *y1_ptr = y + inc_y * width * 1; | |||
FLOAT *y2_ptr = y + inc_y * width * 2; | |||
for (j = 0; j < width; j++) { | |||
SV_TYPE temp00_vec = SV_DUP(0.0); | |||
SV_TYPE temp01_vec = SV_DUP(0.0); | |||
SV_TYPE temp02_vec = SV_DUP(0.0); | |||
i = 0; | |||
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE x0_vec = svld1(pg_true, x + i); | |||
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec); | |||
i += sve_size * 1; | |||
} | |||
if (i < m) { | |||
SV_TYPE x0_vec = svld1(pg, x + i); | |||
SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec); | |||
} | |||
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec); | |||
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec); | |||
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec); | |||
iy += inc_y; | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
} | |||
a_ptr = a2_ptr; | |||
y_ptr = y2_ptr; | |||
for (j = width * 3; j < n; j++) { | |||
SV_TYPE temp_vec = SV_DUP(0.0); | |||
i = 0; | |||
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE x_vec = svld1(pg_true, x + i); | |||
SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||
temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec); | |||
i += sve_size * 1; | |||
} | |||
if (i < m) { | |||
SV_TYPE x_vec = svld1(pg, x + i); | |||
SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); | |||
} | |||
y_ptr[iy] += alpha * svaddv(pg_true, temp_vec); | |||
iy += inc_y; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
a_ptr = a; | |||
for (j = 0; j < n; j++) { | |||
temp = 0.0; | |||
ix = 0; | |||
for (i = 0; i < m; i++) { | |||
temp += a_ptr[i] * x[ix]; | |||
ix += inc_x; | |||
} | |||
y[iy] += alpha * temp; | |||
iy += inc_y; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} |
@@ -0,0 +1,234 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT temp; | |||
iy = 0; | |||
if (inc_x == 1) { | |||
BLASLONG width = (n + 3 - 1) / 3; | |||
FLOAT *a0_ptr = a + lda * width * 0; | |||
FLOAT *a1_ptr = a + lda * width * 1; | |||
FLOAT *a2_ptr = a + lda * width * 2; | |||
FLOAT *y0_ptr = y + inc_y * width * 0; | |||
FLOAT *y1_ptr = y + inc_y * width * 1; | |||
FLOAT *y2_ptr = y + inc_y * width * 2; | |||
for (j = 0; j < width; j++) { | |||
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
SV_TYPE temp00_vec = SV_DUP(0.0); | |||
SV_TYPE temp10_vec = SV_DUP(0.0); | |||
SV_TYPE temp20_vec = SV_DUP(0.0); | |||
SV_TYPE temp30_vec = SV_DUP(0.0); | |||
SV_TYPE temp01_vec = SV_DUP(0.0); | |||
SV_TYPE temp11_vec = SV_DUP(0.0); | |||
SV_TYPE temp21_vec = SV_DUP(0.0); | |||
SV_TYPE temp31_vec = SV_DUP(0.0); | |||
SV_TYPE temp02_vec = SV_DUP(0.0); | |||
SV_TYPE temp12_vec = SV_DUP(0.0); | |||
SV_TYPE temp22_vec = SV_DUP(0.0); | |||
SV_TYPE temp32_vec = SV_DUP(0.0); | |||
i = 0; | |||
BLASLONG sve_size = SV_COUNT(); | |||
while ((i + sve_size * 4 - 1) < m) { | |||
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); | |||
SV_TYPE x1_vec = svld1_vnum(SV_TRUE(), x + i, 1); | |||
SV_TYPE x2_vec = svld1_vnum(SV_TRUE(), x + i, 2); | |||
SV_TYPE x3_vec = svld1_vnum(SV_TRUE(), x + i, 3); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec); | |||
temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec); | |||
temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec); | |||
temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec); | |||
temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec); | |||
temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec); | |||
temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec); | |||
i += sve_size * 4; | |||
} | |||
if (i < m) { | |||
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||
svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||
svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||
pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||
pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||
pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||
pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||
pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||
pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||
pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||
pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||
pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||
SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); | |||
SV_TYPE x1_vec = svld1_vnum(pg1, x + i, 1); | |||
SV_TYPE x2_vec = svld1_vnum(pg2, x + i, 2); | |||
SV_TYPE x3_vec = svld1_vnum(pg3, x + i, 3); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec); | |||
temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec); | |||
temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec); | |||
temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec); | |||
temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec); | |||
temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec); | |||
temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec); | |||
} | |||
temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp10_vec); | |||
temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp11_vec); | |||
temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp12_vec); | |||
temp20_vec = svadd_x(SV_TRUE(), temp20_vec, temp30_vec); | |||
temp21_vec = svadd_x(SV_TRUE(), temp21_vec, temp31_vec); | |||
temp22_vec = svadd_x(SV_TRUE(), temp22_vec, temp32_vec); | |||
temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp20_vec); | |||
temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp21_vec); | |||
temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp22_vec); | |||
if ((j + width * 0) < n) { | |||
temp = svaddv(SV_TRUE(), temp00_vec); | |||
y0_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 1) < n) { | |||
temp = svaddv(SV_TRUE(), temp01_vec); | |||
y1_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 2) < n) { | |||
temp = svaddv(SV_TRUE(), temp02_vec); | |||
y2_ptr[iy] += alpha * temp; | |||
} | |||
iy += inc_y; | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
a_ptr = a; | |||
for (j = 0; j < n; j++) { | |||
temp = 0.0; | |||
ix = 0; | |||
for (i = 0; i < m; i++) { | |||
temp += a_ptr[i] * x[ix]; | |||
ix += inc_x; | |||
} | |||
y[iy] += alpha * temp; | |||
iy += inc_y; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} |
@@ -0,0 +1,40 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
#include "rot_kernel_sve.c" | |||
#include "rot_kernel_c.c" | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
{ | |||
if (n <= 0) | |||
return (0); | |||
if (inc_x == 1 && inc_y == 1) | |||
rot_kernel_sve(n, x, y, c, s); | |||
else | |||
rot_kernel_c(n, x, inc_x, y, inc_y, c, s); | |||
return (0); | |||
} |
@@ -0,0 +1,44 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
{ | |||
BLASLONG i = 0; | |||
BLASLONG ix = 0, iy = 0; | |||
FLOAT temp; | |||
while (i < n) | |||
{ | |||
temp = c * x[ix] + s * y[iy]; | |||
y[iy] = c * y[iy] - s * x[ix]; | |||
x[ix] = temp; | |||
ix += inc_x; | |||
iy += inc_y; | |||
i++; | |||
} | |||
return (0); | |||
} |
@@ -0,0 +1,59 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
#ifdef DOUBLE | |||
#define SVE_TYPE svfloat64_t | |||
#define SVE_ZERO svdup_f64(0.0) | |||
#define SVE_WHILELT svwhilelt_b64 | |||
#define SVE_ALL svptrue_b64() | |||
#define SVE_WIDTH svcntd() | |||
#else | |||
#define SVE_TYPE svfloat32_t | |||
#define SVE_ZERO svdup_f32(0.0) | |||
#define SVE_WHILELT svwhilelt_b32 | |||
#define SVE_ALL svptrue_b32() | |||
#define SVE_WIDTH svcntw() | |||
#endif | |||
static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
{ | |||
for (BLASLONG i = 0; i < n; i += SVE_WIDTH) | |||
{ | |||
svbool_t pg = SVE_WHILELT((uint64_t)i, (uint64_t)n); | |||
SVE_TYPE x_vec = svld1(pg, &x[i]); | |||
SVE_TYPE y_vec = svld1(pg, &y[i]); | |||
SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); | |||
SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); | |||
SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); | |||
SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); | |||
svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); | |||
svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); | |||
} | |||
return (0); | |||
} |
@@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" asr "J", "N", #6 \n" | |||
" cmp "J", xzr \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
".align 5 \n" | |||
#endif | |||
"2: //asum_kernel_F64: \n" | |||
" "KERNEL_F64" \n" | |||
" subs "J", "J", #1 \n" | |||
@@ -0,0 +1,83 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, | |||
BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, | |||
BLASLONG ldc) { | |||
BLASLONG i, j; | |||
BLASLONG chunk, remain; | |||
FLOAT *c_offset1, *c_offset; | |||
c_offset = c; | |||
chunk = m >> 3; | |||
remain = m & 7; | |||
if (beta == ZERO) { | |||
for (j = n; j > 0; j--) { | |||
c_offset1 = c_offset; | |||
c_offset += ldc; | |||
for (i = chunk; i > 0; i--) { | |||
*(c_offset1 + 0) = ZERO; | |||
*(c_offset1 + 1) = ZERO; | |||
*(c_offset1 + 2) = ZERO; | |||
*(c_offset1 + 3) = ZERO; | |||
*(c_offset1 + 4) = ZERO; | |||
*(c_offset1 + 5) = ZERO; | |||
*(c_offset1 + 6) = ZERO; | |||
*(c_offset1 + 7) = ZERO; | |||
c_offset1 += 8; | |||
} | |||
for (i = remain; i > 0; i--) { | |||
*c_offset1 = ZERO; | |||
c_offset1++; | |||
} | |||
} | |||
} else { | |||
for (j = n; j > 0; j--) { | |||
c_offset1 = c_offset; | |||
c_offset += ldc; | |||
for (i = chunk; i > 0; i--) { | |||
*(c_offset1 + 0) *= beta; | |||
*(c_offset1 + 1) *= beta; | |||
*(c_offset1 + 2) *= beta; | |||
*(c_offset1 + 3) *= beta; | |||
*(c_offset1 + 4) *= beta; | |||
*(c_offset1 + 5) *= beta; | |||
*(c_offset1 + 6) *= beta; | |||
*(c_offset1 + 7) *= beta; | |||
c_offset1 += 8; | |||
} | |||
for (i = remain; i > 0; i--) { | |||
*c_offset1 *= beta; | |||
c_offset1++; | |||
} | |||
} | |||
} | |||
return 0; | |||
}; |
@@ -0,0 +1,46 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#define ALPHA_ONE | |||
#include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||
#undef ALPHA_ONE | |||
#include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
FLOAT *C, BLASLONG ldc) { | |||
if (alpha == 1.0f) | |||
return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||
else | |||
return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc); | |||
return 0; | |||
} | |||
@@ -0,0 +1,414 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#define INIT_C(M, N) mc##M##N = svdup_f32(0); | |||
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); | |||
#define INIT_C_4x4 \ | |||
do { \ | |||
INIT_C(0, 0); \ | |||
INIT_C(0, 1); \ | |||
INIT_C(1, 0); \ | |||
INIT_C(1, 1); \ | |||
} while (0); | |||
#ifdef ALPHA_ONE | |||
#define UPDATE_C(PG, PTR, DST, SRC) \ | |||
do { \ | |||
DST = svld1_f32((PG), (PTR)); \ | |||
DST = svadd_z((PG), SRC, DST); \ | |||
svst1_f32((PG), (PTR), DST); \ | |||
} while (0); | |||
#else | |||
#define UPDATE_C(PG, PTR, DST, SRC) \ | |||
do { \ | |||
DST = svld1_f32((PG), (PTR)); \ | |||
DST = svmad_z((PG), svalpha, SRC, DST); \ | |||
svst1_f32((PG), (PTR), DST); \ | |||
} while (0); | |||
#endif | |||
#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||
do { \ | |||
(tmp) = svuzp1_f32((mc0), (mc1)); \ | |||
(vc) = svcompact_f32((PG), (tmp)); \ | |||
} while (0) | |||
#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||
do { \ | |||
(tmp) = svuzp2_f32((mc0), (mc1)); \ | |||
(vc) = svcompact_f32((PG), (tmp)); \ | |||
} while (0) | |||
#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP) \ | |||
do { \ | |||
TMP = svext_f32(mc##M##N, mc##M##N, 4); \ | |||
mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \ | |||
} while (0) | |||
#ifdef ALPHA_ONE | |||
int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, | |||
FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
FLOAT *C, BLASLONG ldc) | |||
#else | |||
int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, | |||
FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, | |||
BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG pad_k = (k + 7) & ~7; | |||
svbfloat16_t ma0, ma1, mb0, mb1; | |||
svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3; | |||
svfloat32_t tmp; | |||
svfloat32_t svalpha = svdup_f32(alpha); | |||
svbool_t pg16_all = svptrue_b16(); | |||
svbool_t pg32_first_1 = svwhilelt_b32(0, 1); | |||
svbool_t pg32_first_2 = svwhilelt_b32(0, 2); | |||
svbool_t pg32_first_4 = svwhilelt_b32(0, 4); | |||
svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0); | |||
bfloat16_t *ptr_a = (bfloat16_t *)A; | |||
bfloat16_t *ptr_b = (bfloat16_t *)B; | |||
FLOAT *ptr_c = C; | |||
bfloat16_t *ptr_a0; | |||
bfloat16_t *ptr_b0; | |||
FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; | |||
for (BLASLONG j = 0; j < n / 4; j++) { | |||
ptr_c0 = ptr_c; | |||
ptr_c1 = ptr_c0 + ldc; | |||
ptr_c2 = ptr_c1 + ldc; | |||
ptr_c3 = ptr_c2 + ldc; | |||
ptr_c += 4 * ldc; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C_4x4; | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
MATMUL(1, 0); | |||
MATMUL(1, 1); | |||
ptr_a0 += 32; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2) | |||
UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3) | |||
ptr_c0 += 4; | |||
ptr_c1 += 4; | |||
ptr_c2 += 4; | |||
ptr_c3 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(0, 1); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
ptr_a0 += 16; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
vc1 = svuzp2(mc00, mc00); | |||
vc2 = svuzp1(mc01, mc01); | |||
vc3 = svuzp2(mc01, mc01); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2); | |||
UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3); | |||
ptr_c0 += 2; | |||
ptr_c1 += 2; | |||
ptr_c2 += 2; | |||
ptr_c3 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(0, 1); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
ptr_a0 += 16; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
// use compact is more straightforward | |||
vc1 = svuzp2(mc00, mc00); | |||
vc3 = svuzp2(mc01, mc01); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01); | |||
UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3); | |||
} | |||
ptr_b += 4 * pad_k; | |||
} | |||
if (n & 2) { | |||
ptr_c0 = ptr_c; | |||
ptr_c1 = ptr_c0 + ldc; | |||
ptr_c += 2 * ldc; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(1, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
MATMUL(1, 0); | |||
ptr_a0 += 32; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2); | |||
ptr_c0 += 4; | |||
ptr_c1 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
vc1 = svuzp2(mc00, mc00); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||
ptr_c0 += 2; | |||
ptr_c1 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc1 = svuzp2(mc00, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||
} | |||
ptr_b += 2 * pad_k; | |||
} | |||
if (n & 1) { // TODO: this case seems a overhead. find out whether it's in our | |||
// case. | |||
ptr_c0 = ptr_c; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(1, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
MATMUL(1, 0); | |||
ptr_a0 += 32; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
ptr_c0 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
ptr_c0 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,148 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
IFLOAT *a_offset; | |||
IFLOAT *a_offsetx[4]; | |||
IFLOAT *b_offset; | |||
a_offset = a; | |||
b_offset = b; | |||
bfloat16_t zero_value_bf16; | |||
*((uint16_t *)(&zero_value_bf16)) = 0; | |||
svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine. | |||
svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||
svbfloat16_t v0, v1, v2, v3; | |||
svuint64_t t0, t1; | |||
BLASLONG rest = m & 7; | |||
svbool_t pg16_rest = svwhilelt_b16_s32(0, rest); | |||
for (BLASLONG j = 0; j < n / 4; j++) { | |||
a_offsetx[0] = a_offset; | |||
a_offsetx[1] = a_offsetx[0] + lda; | |||
a_offsetx[2] = a_offsetx[1] + lda; | |||
a_offsetx[3] = a_offsetx[2] + lda; | |||
a_offset += 4 * lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||
v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]); | |||
v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||
svreinterpret_bf16_u64(t1)); | |||
a_offsetx[0] += 8; | |||
a_offsetx[1] += 8; | |||
a_offsetx[2] += 8; | |||
a_offsetx[3] += 8; | |||
b_offset += 32; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||
v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]); | |||
v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||
svreinterpret_bf16_u64(t1)); | |||
b_offset += 32; | |||
} | |||
} | |||
if (n & 2) { | |||
a_offsetx[0] = a_offset; | |||
a_offsetx[1] = a_offsetx[0] + lda; | |||
a_offset += 2 * lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
a_offsetx[0] += 8; | |||
a_offsetx[1] += 8; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
} | |||
} | |||
if (n & 1) { | |||
a_offsetx[0] = a_offset; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svdup_bf16(zero_value_bf16); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
a_offsetx[0] += 8; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svdup_bf16(zero_value_bf16); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,361 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
BLASLONG pad_m = ((m + 7) & ~7); | |||
BLASLONG rest = (m & 7); // rest along m dim | |||
IFLOAT *a_offset; | |||
IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; | |||
IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; | |||
IFLOAT *b_offset; | |||
IFLOAT *b_offset0, *b_offset1; | |||
a_offset = a; | |||
b_offset = b; | |||
svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; | |||
svuint16_t t0, t1, t2, t3; | |||
svuint32_t m00, m01, m10, m11; | |||
svuint64_t st_offsets_0, st_offsets_1; | |||
svbool_t pg16_first_4 = svwhilelt_b16(0, 4); | |||
svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||
svbool_t pg64_first_4 = svwhilelt_b64(0, 4); | |||
u_int32_t sizeof_u64 = 8; | |||
u_int64_t _st_offsets_0[4] = { | |||
0 * sizeof_u64, | |||
1 * sizeof_u64, | |||
4 * sizeof_u64, | |||
5 * sizeof_u64, | |||
}; | |||
u_int64_t _st_offsets_1[4] = { | |||
2 * sizeof_u64, | |||
3 * sizeof_u64, | |||
6 * sizeof_u64, | |||
7 * sizeof_u64, | |||
}; | |||
st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); | |||
st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); | |||
for (BLASLONG j = 0; j < n / 8; j++) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 8; | |||
b_offset0 = b_offset; | |||
b_offset1 = b_offset0 + 4 * pad_m; | |||
b_offset += 8 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
// transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||
// small blocks | |||
c0 = svld1_u16(pg16_first_8, a_offset0); | |||
c1 = svld1_u16(pg16_first_8, a_offset1); | |||
c2 = svld1_u16(pg16_first_8, a_offset2); | |||
c3 = svld1_u16(pg16_first_8, a_offset3); | |||
c4 = svld1_u16(pg16_first_8, a_offset4); | |||
c5 = svld1_u16(pg16_first_8, a_offset5); | |||
c6 = svld1_u16(pg16_first_8, a_offset6); | |||
c7 = svld1_u16(pg16_first_8, a_offset7); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_0, svreinterpret_u64_u32(m10)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_1, svreinterpret_u64_u32(m11)); | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
b_offset0 += 32; | |||
b_offset1 += 32; | |||
} | |||
if (rest) { | |||
c0 = svld1_u16(pg16_first_8, a_offset0); | |||
c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); | |||
c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); | |||
c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); | |||
c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); | |||
c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); | |||
c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); | |||
c7 = (svdup_u16(0)); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_0, svreinterpret_u64_u32(m10)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_1, svreinterpret_u64_u32(m11)); | |||
} | |||
} | |||
if (n & 4) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 4; | |||
b_offset0 = b_offset; | |||
b_offset += 4 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
// transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||
// small blocks | |||
c0 = svld1_u16(pg16_first_4, a_offset0); | |||
c1 = svld1_u16(pg16_first_4, a_offset1); | |||
c2 = svld1_u16(pg16_first_4, a_offset2); | |||
c3 = svld1_u16(pg16_first_4, a_offset3); | |||
c4 = svld1_u16(pg16_first_4, a_offset4); | |||
c5 = svld1_u16(pg16_first_4, a_offset5); | |||
c6 = svld1_u16(pg16_first_4, a_offset6); | |||
c7 = svld1_u16(pg16_first_4, a_offset7); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
b_offset0 += 32; | |||
} | |||
if (rest) { | |||
c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 | |||
c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); | |||
c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); | |||
c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); | |||
c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); | |||
c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); | |||
c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); | |||
c7 = (svdup_u16(0)); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
} | |||
} | |||
if (n & 2) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 2; | |||
b_offset0 = b_offset; | |||
b_offset1 = b_offset0 + 8; | |||
b_offset += 2 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
for (BLASLONG line = 0; line < 2; line++) { | |||
b_offset0[line * 4] = a_offset0[line]; | |||
b_offset0[line * 4 + 1] = a_offset1[line]; | |||
b_offset0[line * 4 + 2] = a_offset2[line]; | |||
b_offset0[line * 4 + 3] = a_offset3[line]; | |||
b_offset1[line * 4] = a_offset4[line]; | |||
b_offset1[line * 4 + 1] = a_offset5[line]; | |||
b_offset1[line * 4 + 2] = a_offset6[line]; | |||
b_offset1[line * 4 + 3] = a_offset7[line]; | |||
} | |||
b_offset0 += 16; | |||
b_offset1 += 16; | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
} | |||
if (rest) { | |||
for (BLASLONG line = 0; line < 2; line++) { | |||
b_offset0[line * 4] = a_offset0[line]; | |||
b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; | |||
b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; | |||
b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; | |||
b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; | |||
b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; | |||
b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line]; | |||
b_offset1[line * 4 + 3] = 0; | |||
} | |||
} | |||
} | |||
if (n & 1) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
b_offset[0] = a_offset0[0]; | |||
b_offset[1] = a_offset1[0]; | |||
b_offset[2] = a_offset2[0]; | |||
b_offset[3] = a_offset3[0]; | |||
b_offset[4] = 0; | |||
b_offset[5] = 0; | |||
b_offset[6] = 0; | |||
b_offset[7] = 0; | |||
b_offset[8] = a_offset4[0]; | |||
b_offset[9] = a_offset5[0]; | |||
b_offset[10] = a_offset6[0]; | |||
b_offset[11] = a_offset7[0]; | |||
b_offset[12] = 0; | |||
b_offset[13] = 0; | |||
b_offset[14] = 0; | |||
b_offset[15] = 0; | |||
b_offset += 16; | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
} | |||
if (rest) { | |||
b_offset[0] = *a_offset0; | |||
b_offset[1] = rest == 1 ? 0 : *a_offset1; | |||
b_offset[2] = rest <= 2 ? 0 : *a_offset2; | |||
b_offset[3] = rest <= 3 ? 0 : *a_offset3; | |||
b_offset[4] = 0; | |||
b_offset[5] = 0; | |||
b_offset[6] = 0; | |||
b_offset[7] = 0; | |||
b_offset[8] = rest <= 4 ? 0 : *a_offset4; | |||
b_offset[9] = rest <= 5 ? 0 : *a_offset5; | |||
b_offset[10] = rest <= 6 ? 0 : *a_offset6; | |||
b_offset[11] = 0; | |||
b_offset[12] = 0; | |||
b_offset[13] = 0; | |||
b_offset[14] = 0; | |||
b_offset[15] = 0; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -222,7 +222,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n8 = N & -8; | |||
const BLASLONG n4 = N & -4; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -223,7 +223,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n8 = N & -8; | |||
const BLASLONG n4 = N & -4; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -0,0 +1,40 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include "swap_kernel_sve.c" | |||
#include "swap_kernel_c.c" | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
{ | |||
if (n <= 0) | |||
return 0; | |||
if (inc_x == 1 && inc_y == 1) | |||
swap_kernel_sve(n, x, y); | |||
else | |||
swap_kernel_c(n, x, inc_x, y, inc_y); | |||
return (0); | |||
} |
@@ -0,0 +1,46 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <stdio.h> | |||
static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
BLASLONG i = 0; | |||
BLASLONG ix = 0, iy = 0; | |||
FLOAT temp; | |||
while (i < n) | |||
{ | |||
temp = x[ix]; | |||
x[ix] = y[iy]; | |||
y[iy] = temp; | |||
ix += inc_x; | |||
iy += inc_y; | |||
i++; | |||
} | |||
return (0); | |||
} |