@@ -30,6 +30,15 @@ task: | |||
- cd build | |||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
- make | |||
task: | |||
name: AppleM1/GCC/MAKE/OPENMP | |||
compile_script: | |||
- brew install gcc@11 | |||
- export PATH=/opt/homebrew/bin:$PATH | |||
- export LDFLAGS="-L/opt/homebrew/lib" | |||
- export CPPFLAGS="-I/opt/homebrew/include" | |||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
macos_instance: | |||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
@@ -151,40 +151,53 @@ jobs: | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
msystem: [MINGW64, MINGW32, CLANG64] | |||
msystem: [MINGW64, MINGW32, CLANG64, CLANG32] | |||
idx: [int32, int64] | |||
build-type: [Release] | |||
include: | |||
- msystem: MINGW64 | |||
idx: int32 | |||
target-prefix: mingw-w64-x86_64 | |||
fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
fc-pkg: fc | |||
- msystem: MINGW32 | |||
idx: int32 | |||
target-prefix: mingw-w64-i686 | |||
fc-pkg: mingw-w64-i686-gcc-fortran | |||
fc-pkg: fc | |||
- msystem: CLANG64 | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: CLANG32 | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-i686 | |||
fc-pkg: cc | |||
c-lapack-flags: -DC_LAPACK=ON | |||
- msystem: MINGW64 | |||
idx: int64 | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
target-prefix: mingw-w64-x86_64 | |||
fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
fc-pkg: fc | |||
- msystem: CLANG64 | |||
idx: int64 | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
c-lapack-flags: -DC_LAPACK=ON | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: MINGW64 | |||
idx: int32 | |||
target-prefix: mingw-w64-x86_64 | |||
fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
fc-pkg: fc | |||
build-type: None | |||
exclude: | |||
- msystem: MINGW32 | |||
idx: int64 | |||
- msystem: CLANG32 | |||
idx: int64 | |||
defaults: | |||
run: | |||
@@ -209,7 +222,7 @@ jobs: | |||
install: >- | |||
base-devel | |||
${{ matrix.target-prefix }}-cc | |||
${{ matrix.fc-pkg }} | |||
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} | |||
${{ matrix.target-prefix }}-cmake | |||
${{ matrix.target-prefix }}-ninja | |||
${{ matrix.target-prefix }}-ccache | |||
@@ -261,6 +274,7 @@ jobs: | |||
-DTARGET=CORE2 \ | |||
${{ matrix.idx64-flags }} \ | |||
${{ matrix.c-lapack-flags }} \ | |||
${{ matrix.no-avx512-flags }} \ | |||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
.. | |||
@@ -280,9 +294,22 @@ jobs: | |||
key: ${{ steps.ccache-prepare.outputs.key }} | |||
- name: Run tests | |||
id: run-ctest | |||
timeout-minutes: 60 | |||
run: cd build && ctest | |||
- name: Re-run tests | |||
if: always() && (steps.run-ctest.outcome == 'failure') | |||
timeout-minutes: 60 | |||
run: | | |||
cd build | |||
echo "::group::Re-run ctest" | |||
ctest --rerun-failed --output-on-failure || true | |||
echo "::endgroup::" | |||
echo "::group::Log from these tests" | |||
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log | |||
echo "::endgroup::" | |||
cross_build: | |||
runs-on: ubuntu-22.04 | |||
@@ -0,0 +1,110 @@ | |||
name: loongarch64 qemu test | |||
on: [push, pull_request] | |||
jobs: | |||
TEST: | |||
runs-on: ubuntu-latest | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
include: | |||
- target: LOONGSONGENERIC | |||
triple: loongarch64-unknown-linux-gnu | |||
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC | |||
- target: LOONGSON3R5 | |||
triple: loongarch64-unknown-linux-gnu | |||
opts: NO_SHARED=1 TARGET=LOONGSON3R5 | |||
- target: LOONGSON2K1000 | |||
triple: loongarch64-unknown-linux-gnu | |||
opts: NO_SHARED=1 TARGET=LOONGSON2K1000 | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
- name: Install APT deps | |||
run: | | |||
sudo add-apt-repository ppa:savoury1/virtualisation | |||
sudo apt-get update | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
qemu-user-static | |||
- name: Download and install loongarch64-toolchain | |||
run: | | |||
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz | |||
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt | |||
- name: Set env | |||
run: | | |||
echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV | |||
echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV | |||
- name: Compilation cache | |||
uses: actions/cache@v3 | |||
with: | |||
path: ~/.ccache | |||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
restore-keys: | | |||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
ccache-${{ runner.os }}-${{ matrix.target }} | |||
- name: Configure ccache | |||
run: | | |||
test -d ~/.ccache || mkdir -p ~/.ccache | |||
echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
echo "compression = true" >> ~/.ccache/ccache.conf | |||
ccache -s | |||
- name: Disable utest dsdot:dsdot_n_1 | |||
run: | | |||
echo -n > utest/test_dsdot.c | |||
echo "Due to the qemu versions 7.2 causing utest cases to fail," | |||
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." | |||
- name: Build OpenBLAS | |||
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
- name: Test | |||
run: | | |||
qemu-loongarch64-static ./utest/openblas_utest | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 | |||
rm -f ./test/?BLAT2.SUMM | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
rm -f ./test/?BLAT2.SUMM | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
rm -f ./test/?BLAT3.SUMM | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat | |||
rm -f ./test/?BLAT3.SUMM | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat |
@@ -72,6 +72,7 @@ test/SBLAT3.SUMM | |||
test/ZBLAT2.SUMM | |||
test/ZBLAT3.SUMM | |||
test/SHBLAT3.SUMM | |||
test/SBBLAT3.SUMM | |||
test/cblat1 | |||
test/cblat2 | |||
test/cblat3 | |||
@@ -82,6 +83,7 @@ test/sblat1 | |||
test/sblat2 | |||
test/sblat3 | |||
test/test_shgemm | |||
test/test_sbgemm | |||
test/zblat1 | |||
test/zblat2 | |||
test/zblat3 | |||
@@ -7,7 +7,7 @@ pipeline { | |||
stages { | |||
stage('Build') { | |||
steps { | |||
sh 'make' | |||
sh 'make clean && make' | |||
} | |||
} | |||
} | |||
@@ -9,7 +9,7 @@ pipeline { | |||
steps { | |||
sh 'sudo apt update' | |||
sh 'sudo apt install gfortran -y' | |||
sh 'make' | |||
sh 'make clean && make' | |||
} | |||
} | |||
} | |||
@@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d | |||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
endif | |||
ifeq ($(C_COMPILER), CLANG)
# Clang major-version gates. The major version is the first dot-separated
# field of `$(CC) -dumpversion`; each variable receives the output of `expr`
# for the comparison (1 when the major version is at least the threshold,
# 0 otherwise).
CLANGVERSIONGTEQ9 := $(shell expr $(firstword $(subst ., ,$(shell $(CC) -dumpversion))) \>= 9)
CLANGVERSIONGTEQ12 := $(shell expr $(firstword $(subst ., ,$(shell $(CC) -dumpversion))) \>= 12)
endif
# | |||
# OS dependent settings | |||
# | |||
@@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1 | |||
ifneq ($(NO_SVE), 1) | |||
DYNAMIC_CORE += NEOVERSEV1 | |||
DYNAMIC_CORE += NEOVERSEN2 | |||
DYNAMIC_CORE += ARMV8SVE | |||
endif | |||
DYNAMIC_CORE += CORTEXA55 | |||
DYNAMIC_CORE += FALKOR | |||
@@ -1086,8 +1092,9 @@ endif | |||
endif | |||
endif | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) | |||
CCOMMON_OPT += -DF_INTERFACE_GFORT | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
FCOMMON_OPT += -Wall | |||
# make single-threaded LAPACK calls thread-safe #1847 | |||
FCOMMON_OPT += -frecursive | |||
@@ -1101,6 +1108,7 @@ EXTRALIB += -lgfortran | |||
endif | |||
endif | |||
endif | |||
endif | |||
ifdef NO_BINARY_MODE | |||
ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
ifdef BINARY64 | |||
@@ -1767,6 +1775,8 @@ export TARGET_CORE | |||
export NO_AVX512 | |||
export NO_AVX2 | |||
export BUILD_BFLOAT16 | |||
export NO_LSX | |||
export NO_LASX | |||
export SBGEMM_UNROLL_M | |||
export SBGEMM_UNROLL_N | |||
@@ -75,18 +75,31 @@ endif | |||
ifeq ($(CORE), COOPERLAKE) | |||
ifndef NO_AVX512 | |||
ifeq ($(C_COMPILER), GCC) | |||
# cooperlake support was added in 10.1 | |||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
CCOMMON_OPT += -march=cooperlake | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=cooperlake | |||
endif | |||
else # gcc not support, fallback to avx512 | |||
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
# cooperlake support was added in 10.1 | |||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
CCOMMON_OPT += -march=cooperlake | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=cooperlake | |||
endif | |||
else # gcc not support, fallback to avx512 | |||
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
else ifeq ($(C_COMPILER), CLANG) | |||
# cooperlake support was added in clang 9 | |||
ifeq ($(CLANGVERSIONGTEQ9), 1) | |||
CCOMMON_OPT += -march=cooperlake | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=cooperlake | |||
endif | |||
else # not supported in clang, fallback to avx512 | |||
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
@@ -104,18 +117,31 @@ endif | |||
ifeq ($(CORE), SAPPHIRERAPIDS) | |||
ifndef NO_AVX512 | |||
ifeq ($(C_COMPILER), GCC) | |||
# sapphire rapids support was added in 11 | |||
ifeq ($(GCCVERSIONGTEQ11), 1) | |||
CCOMMON_OPT += -march=sapphirerapids | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=sapphirerapids | |||
endif | |||
else # gcc not support, fallback to avx512 | |||
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
# sapphire rapids support was added in 11 | |||
ifeq ($(GCCVERSIONGTEQ11), 1) | |||
CCOMMON_OPT += -march=sapphirerapids | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=sapphirerapids | |||
endif | |||
else # gcc not support, fallback to avx512 | |||
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
else ifeq ($(C_COMPILER), CLANG)
    # Sapphire Rapids with clang: -march=sapphirerapids is used when
    # CLANGVERSIONGTEQ12 is 1 (clang 12 or newer); otherwise fall back to
    # -march=skylake-avx512, mirroring the GCC branch of this block.
    # NOTE(review): this branch previously passed -march=cooperlake, which
    # looks like a copy-paste from the COOPERLAKE block - confirm against the
    # oldest clang/Fortran compiler combination that must be supported.
    ifeq ($(CLANGVERSIONGTEQ12), 1)
        CCOMMON_OPT += -march=sapphirerapids
        # The NAG Fortran compiler is not given -march flags.
        ifneq ($(F_COMPILER), NAG)
            FCOMMON_OPT += -march=sapphirerapids
        endif
    else # not supported in clang, fallback to avx512
        CCOMMON_OPT += -march=skylake-avx512
        ifneq ($(F_COMPILER), NAG)
            FCOMMON_OPT += -march=skylake-avx512
        endif
    endif
endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
@@ -271,6 +271,19 @@ jobs: | |||
- script: | | |||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
- job: OSX_xbuild_DYNAMIC_ARM64 | |||
pool: | |||
vmImage: 'macOS-11' | |||
variables: | |||
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 | |||
steps: | |||
- script: | | |||
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs | |||
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus | |||
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version | |||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
- job: ALPINE_MUSL | |||
pool: | |||
vmImage: 'ubuntu-latest' | |||
@@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then | |||
rm -rf "$tmpd" | |||
fi | |||
# LoongArch SIMD capability probe.
# For each of LSX and LASX, write a one-instruction C program that includes the
# matching intrinsics header, and try to build it with the matching -m flag.
# If the compiler exits non-zero, the corresponding no_lsx / no_lasx flag is
# set to 1. Both flags stay 0 on every other architecture.
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
    # All scratch files live in one temporary directory that is removed below.
    tmpd="$(mktemp -d)"

    # --- LSX ---
    tmplsx="$tmpd/lsx.c"
    codelsx='"vadd.b $vr0, $vr0, $vr0"'
    lsx_flags='-march=loongarch64 -mlsx'
    {
        printf "#include <lsxintrin.h>\n\n"
        printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx"
    } >> "$tmplsx"
    args="$lsx_flags -o $tmplsx.o $tmplsx"
    # $flags and $args are intentionally unquoted so they split into words.
    if ! $compiler_name $flags $args >/dev/null 2>&1; then
        no_lsx=1
    fi

    # --- LASX ---
    tmplasx="$tmpd/lasx.c"
    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
    lasx_flags='-march=loongarch64 -mlasx'
    {
        printf "#include <lasxintrin.h>\n\n"
        printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx"
    } >> "$tmplasx"
    args="$lasx_flags -o $tmplasx.o $tmplasx"
    if ! $compiler_name $flags $args >/dev/null 2>&1; then
        no_lasx=1
    fi

    rm -rf "$tmpd"
fi
case "$data" in | |||
*ARCH_X86_64*) architecture=x86_64 ;; | |||
*ARCH_X86*) architecture=x86 ;; | |||
@@ -252,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then | |||
no_sve=0 | |||
{ | |||
$compiler_name $flags $args >/dev/null 2>&1 | |||
} || { | |||
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" | |||
$compiler_name $flags $args >/dev/null 2>&1 | |||
} || { | |||
no_sve=1 | |||
} | |||
@@ -399,6 +433,8 @@ done | |||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" | |||
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" | |||
} >> "$makefile" | |||
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` | |||
@@ -414,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` | |||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" | |||
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" | |||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" | |||
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" | |||
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" | |||
} >> "$config" | |||
@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
} | |||
} | |||
# LoongArch SIMD capability probe.
# For each of LSX and LASX, a one-instruction C program that includes the
# matching intrinsics header is compiled with the matching -m flag. A non-zero
# exit status from the compiler sets $no_lsx / $no_lasx to 1; both stay 0 on
# other architectures or when File::Temp cannot be loaded.
$no_lsx = 0;
$no_lasx = 0;
if (($architecture eq "loongarch64")) {
    eval "use File::Temp qw(tempfile)";
    if ($@){
        warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
    } else {
        # Compile one probe program.
        #   $header    - intrinsics header to #include
        #   $code      - quoted inline-assembly string placed inside __asm__
        #   $ext_flags - compiler flags enabling the extension under test
        # Returns 1 if the compiler exits non-zero, 0 otherwise.
        my $probe_fails = sub {
            my ($header, $code, $ext_flags) = @_;
            # UNLINK => 1 removes the temporary source once $src is destroyed.
            my $src = File::Temp->new( SUFFIX => '.c' , UNLINK => 1 );
            print $src "#include <$header>\n\n";
            print $src "void main(void){ __asm__ volatile($code); }\n";
            my $args = "$ext_flags -o $src.o $src";
            # Single-string form so the shell handles the redirections.
            system("$compiler_name $flags $args >/dev/null 2>/dev/null");
            my $failed = ($? != 0) ? 1 : 0;
            # Remove the output file the compiler may have produced.
            unlink("$src.o");
            return $failed;
        };

        $no_lsx  = $probe_fails->("lsxintrin.h",  '"vadd.b $vr0, $vr0, $vr0"',  "-march=loongarch64 -mlsx");
        $no_lasx = $probe_fails->("lasxintrin.h", '"xvadd.b $xr0, $xr0, $xr0"', "-march=loongarch64 -mlasx");
    }
}
$architecture = x86 if ($data =~ /ARCH_X86/); | |||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
$architecture = e2k if ($data =~ /ARCH_E2K/); | |||
@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; | |||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; | |||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; | |||
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; | |||
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; | |||
$os =~ tr/[a-z]/[A-Z]/; | |||
$architecture =~ tr/[a-z]/[A-Z]/; | |||
@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; | |||
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; | |||
if ($os eq "LINUX") { | |||
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) | |||
if (ARM64) | |||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) | |||
endif () | |||
if (DYNAMIC_LIST) | |||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
@@ -135,7 +135,7 @@ if (ARM64) | |||
set(BINARY_DEFINED 1) | |||
endif () | |||
if (${ARCH} STREQUAL "riscv64") | |||
if (RISCV64) | |||
set(NO_BINARY_MODE 1) | |||
set(BINARY_DEFINED 1) | |||
endif () | |||
@@ -180,22 +180,30 @@ endif () | |||
if (${CORE} STREQUAL NEOVERSEN2) | |||
if (NOT DYNAMIC_ARCH) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
endif() | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
endif() | |||
endif () | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL NEOVERSEV1) | |||
if (NOT DYNAMIC_ARCH) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
endif() | |||
endif() | |||
endif () | |||
endif () | |||
@@ -213,7 +221,11 @@ endif () | |||
if (${CORE} STREQUAL ARMV8SVE) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
endif () | |||
endif () | |||
endif () | |||
@@ -3,7 +3,8 @@ | |||
## Description: Ported from portion of OpenBLAS/Makefile.system | |||
## Sets Fortran related variables. | |||
if (${F_COMPILER} STREQUAL "FLANG") | |||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
# This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
if (BINARY64 AND INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
@@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") | |||
endif () | |||
endif () | |||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") | |||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | |||
# ensure reentrancy of lapack codes | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
# work around ABI violation in passing string arguments from C | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
if (NOT NO_LAPACK) | |||
set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
# ensure reentrancy of lapack codes | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
# work around ABI violation in passing string arguments from C | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
if (NOT NO_LAPACK) | |||
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
endif () | |||
endif () | |||
if (NO_BINARY_MODE) | |||
if (MIPS64) | |||
@@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
endif () | |||
endif () | |||
if (RISCV64) | |||
if (BINARY64) | |||
if (INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
endif () | |||
endif () | |||
endif () | |||
else () | |||
if (BINARY64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
@@ -282,23 +282,35 @@ if (DEFINED TARGET) | |||
endif() | |||
if (${TARGET} STREQUAL NEOVERSEV1)
  # Kernel flags for an explicit Neoverse V1 target.
  # The PGI compiler (unless NO_SVE is set) gets -Msve_intrinsics plus the
  # architecture flags without a version check. Every other case is gated on
  # the version printed by `-dumpversion`: 10.4 or newer gets the Neoverse V1
  # flags, anything older aborts configuration.
  if (NO_SVE OR NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI"))
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_LESS 10.4)
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
    else ()
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
    endif()
  else ()
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
  endif()
endif()
if (${TARGET} STREQUAL NEOVERSEN2)
  # Kernel flags for an explicit Neoverse N2 target.
  # The PGI compiler (unless NO_SVE is set) gets the SVE-intrinsics switch plus
  # the architecture flags without a version check. Every other case is gated
  # on the version printed by `-dumpversion`: 10.4 or newer gets the Neoverse
  # N2 flags, anything older aborts configuration.
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    # Spelled -Msve_intrinsics (underscore) to match the compiler capability
    # probe and the other PGI branches.
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
  else ()
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
      # Report the compiler path and version, as the Neoverse V1 branch does.
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
    endif()
  endif()
endif()
if (${TARGET} STREQUAL ARMV8SVE)
  # Kernel flags for the generic ARMv8 SVE target. Both branches use
  # -march=armv8.2-a+sve; the PGI compiler (unless NO_SVE is set) additionally
  # gets the SVE-intrinsics switch.
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    # Spelled -Msve_intrinsics (underscore) to match the compiler capability
    # probe and the other PGI branches.
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.2-a+sve")
  else ()
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
  endif()
endif()
endif() | |||
@@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
set(MIPS64 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
set(LOONGARCH64 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") | |||
set(RISCV64 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
if (NOT BINARY) | |||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
@@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
endif() | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
set(X86 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") | |||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
set(ARM64 1) | |||
else() | |||
@@ -107,7 +109,7 @@ else() | |||
endif () | |||
if (NOT BINARY) | |||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) | |||
set(BINARY 64) | |||
else () | |||
set(BINARY 32) | |||
@@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); | |||
/* Global Parameter */ | |||
extern int blas_cpu_number; | |||
extern int blas_num_threads; | |||
extern int blas_num_threads_set; | |||
extern int blas_omp_linked; | |||
#define BLAS_LEGACY 0x8000U | |||
@@ -136,15 +135,13 @@ typedef struct blas_queue { | |||
#ifdef SMP_SERVER | |||
extern int blas_server_avail; | |||
extern int blas_omp_number_max; | |||
static __inline int num_cpu_avail(int level) { | |||
#ifdef USE_OPENMP | |||
int openmp_nthreads; | |||
if (blas_num_threads_set == 0) | |||
openmp_nthreads=omp_get_max_threads(); | |||
else | |||
openmp_nthreads=blas_cpu_number; | |||
#endif | |||
#ifndef USE_OPENMP | |||
@@ -156,7 +153,13 @@ int openmp_nthreads; | |||
) return 1; | |||
#ifdef USE_OPENMP | |||
if (blas_cpu_number != openmp_nthreads) { | |||
if (openmp_nthreads > blas_omp_number_max){ | |||
#ifdef DEBUG | |||
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); | |||
#endif | |||
openmp_nthreads = blas_omp_number_max; | |||
} | |||
if (blas_cpu_number != openmp_nthreads) { | |||
goto_set_num_threads(openmp_nthreads); | |||
} | |||
#endif | |||
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdint.h> | |||
#include <sys/auxv.h> | |||
/* If LASX extension instructions supported, | |||
* using core LOONGSON3R5 | |||
@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CPU_LOONGSON3R5 1 | |||
#define CPU_LOONGSON2K1000 2 | |||
#define LOONGARCH_CFG2 0x02 | |||
#define LOONGARCH_LASX 1<<7 | |||
#define LOONGARCH_LSX 1<<6 | |||
#define LA_HWCAP_LSX (1<<4) | |||
#define LA_HWCAP_LASX (1<<5) | |||
static char *cpuname[] = { | |||
"LOONGSONGENERIC", | |||
@@ -64,17 +64,11 @@ static char *cpuname_lower[] = { | |||
int detect(void) { | |||
#ifdef __linux | |||
uint32_t reg = 0; | |||
int flag = (int)getauxval(AT_HWCAP); | |||
__asm__ volatile ( | |||
"cpucfg %0, %1 \n\t" | |||
: "+&r"(reg) | |||
: "r"(LOONGARCH_CFG2) | |||
); | |||
if (reg & LOONGARCH_LASX) | |||
if (flag & LA_HWCAP_LASX) | |||
return CPU_LOONGSON3R5; | |||
else if (reg & LOONGARCH_LSX) | |||
else if (flag & LA_HWCAP_LSX) | |||
return CPU_LOONGSON2K1000; | |||
else | |||
return CPU_GENERIC; | |||
@@ -1551,6 +1551,7 @@ int get_cpuname(void){ | |||
case 7: // Raptor Lake | |||
case 10: | |||
case 15: | |||
case 14: // Alder Lake N | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
@@ -2360,6 +2361,7 @@ int get_coretype(void){ | |||
case 7: // Raptor Lake | |||
case 10: | |||
case 15: | |||
case 14: // Alder Lake N | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
@@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
ifeq ($(USE_OPENMP), 1) | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
ifeq ($(C_COMPILER), CLANG) | |||
CEXTRALIB = -lomp | |||
CEXTRALIB += -lomp | |||
endif | |||
endif | |||
ifeq ($(F_COMPILER), NAG) | |||
@@ -0,0 +1,270 @@ | |||
# Guidance for redistributing OpenBLAS | |||
*We note that this document contains recommendations only - packagers and other | |||
redistributors are in charge of how OpenBLAS is built and distributed in their | |||
systems, and may have good reasons to deviate from the guidance given on this | |||
page. These recommendations are aimed at general packaging systems, with a user | |||
base that typically is large, open source (or freely available at least), and | |||
neither behaves uniformly nor is directly connected with the packager.* | |||
OpenBLAS has a large number of build-time options which can be used to change | |||
how it behaves at runtime, how artifacts or symbols are named, etc. Variation | |||
in build configuration can be necessary to achieve a given end goal within a | |||
distribution or as an end user. However, such variation can also make it more | |||
difficult to build on top of OpenBLAS and ship code or other packages in a way | |||
that works across many different distros. Here we provide guidance about the | |||
most important build options, what effects they may have when changed, and | |||
which ones to default to. | |||
The Make and CMake build systems provide equivalent options and yield more or | |||
less the same artifacts, but not exactly (the CMake builds are still | |||
experimental). You can choose either one and the options will function in the | |||
same way, however the CMake outputs may require some renaming. To review | |||
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of | |||
the repository. | |||
Build options typically fall into two categories: (a) options that affect the | |||
user interface, such as library and symbol names or APIs that are made | |||
available, and (b) options that affect performance and runtime behavior, such | |||
as threading behavior or CPU architecture-specific code paths. The user | |||
interface options are more important to keep aligned between distributions, | |||
while for the performance-related options there are typically more reasons to | |||
make choices that deviate from the defaults. | |||
Here are recommendations for user interface related packaging choices where it | |||
is not likely to be a good idea to deviate (typically these are the default | |||
settings): | |||
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect | |||
binary size much, so don't turn it off. | |||
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and | |||
while it does make up a significant part of the binary size of the installed | |||
library, that does not outweigh the regression in usability when deviating | |||
from the default here.[^1] | |||
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency | |||
detection files. These files are used by build systems when users want to | |||
link against OpenBLAS, and there is no benefit to leaving them out. | |||
4. Provide the LP64 interface by default, and if in addition to that you choose | |||
to provide an ILP64 interface build as well, use a symbol suffix to avoid | |||
symbol name clashes (see the next section). | |||
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we | |||
know. Older versions of Arch Linux did not, and that was known to cause | |||
problems. | |||
## ILP64 interface builds | |||
The LP64 (32-bit integer) interface is the default build, and has | |||
well-established C and Fortran APIs as determined by the reference (Netlib) | |||
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does | |||
not have a standard API: symbol names and shared/static library names can be | |||
produced in multiple ways, and this tends to make it difficult to use. | |||
As of today there is an agreed-upon way of choosing names for OpenBLAS between | |||
a number of key users/redistributors, which is the closest thing to a standard | |||
that there is now. However, there is an ongoing standardization effort in the | |||
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS | |||
agreed-upon convention. In this section we'll aim to explain both. | |||
Those two methods are fairly similar, and have a key thing in common: *using a | |||
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is | |||
recommended to have it use a symbol suffix containing `64` in the name. | |||
This avoids potential symbol clashes when different packages which depend on | |||
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. | |||
### The current OpenBLAS agreed-upon ILP64 convention | |||
This convention comprises the shared library name and the symbol suffix in the | |||
shared library. The symbol suffix to use is `64_`, implying that the library | |||
name will be `libopenblas64_.so` and the symbols in that library end in `64_`. | |||
The central issue where this was discussed is | |||
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters | |||
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. | |||
To build shared and static libraries with the currently recommended ILP64 | |||
conventions with Make: | |||
```bash | |||
$ make INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
``` | |||
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file | |||
named `openblas64.pc`, and CMake and header files. | |||
Installing locally and inspecting the output will show a few more details: | |||
```bash | |||
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
$ tree . # output slightly edited down | |||
. | |||
├── include | |||
│ ├── cblas.h | |||
│ ├── f77blas.h | |||
│ ├── lapacke_config.h | |||
│ ├── lapacke.h | |||
│ ├── lapacke_mangling.h | |||
│ ├── lapacke_utils.h | |||
│ ├── lapack.h | |||
│ └── openblas_config.h | |||
└── lib | |||
├── cmake | |||
│ └── openblas | |||
│ ├── OpenBLASConfig.cmake | |||
│ └── OpenBLASConfigVersion.cmake | |||
├── libopenblas64_.a | |||
├── libopenblas64_.so | |||
└── pkgconfig | |||
└── openblas64.pc | |||
``` | |||
The symbol names are a key point. Each consists of the LP64 symbol name, then | |||
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix. | |||
Hence to obtain the final symbol names, we need to take into account which | |||
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel | |||
Fortran, or Flang), that means appending a single underscore. In that case, the | |||
result is: | |||
| base API name | binary symbol name | call from Fortran code | call from C code | | |||
|---------------|--------------------|------------------------|-----------------------| | |||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | | |||
It is quite useful to have these symbol names be as uniform as possible across | |||
different packaging systems. | |||
The equivalent build options with CMake are: | |||
```bash | |||
$ mkdir build && cd build | |||
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON | |||
$ cmake --build . -j | |||
``` | |||
Note that the result is not 100% identical to the Make result. For example, the | |||
library name ends in `_64` rather than `64_` - it is recommended to rename them | |||
to match the Make library names (also update the `libsuffix` entry in | |||
`openblas64.pc` to match that rename). | |||
```bash | |||
$ cmake --install . --prefix $PWD/../../openblas/cmake64 | |||
$ tree . | |||
. | |||
├── include | |||
│ └── openblas64 | |||
│ ├── cblas.h | |||
│ ├── f77blas.h | |||
│ ├── lapacke_config.h | |||
│ ├── lapacke_example_aux.h | |||
│ ├── lapacke.h | |||
│ ├── lapacke_mangling.h | |||
│ ├── lapacke_utils.h | |||
│ ├── lapack.h | |||
│ ├── openblas64 | |||
│ │ └── lapacke_mangling.h | |||
│ └── openblas_config.h | |||
└── lib | |||
├── cmake | |||
│ └── OpenBLAS64 | |||
│ ├── OpenBLAS64Config.cmake | |||
│ ├── OpenBLAS64ConfigVersion.cmake | |||
│ ├── OpenBLAS64Targets.cmake | |||
│ └── OpenBLAS64Targets-noconfig.cmake | |||
├── libopenblas_64.a | |||
├── libopenblas_64.so -> libopenblas_64.so.0 | |||
└── pkgconfig | |||
└── openblas64.pc | |||
``` | |||
### The upcoming standardized ILP64 convention | |||
While the `64_` convention above got some adoption, it's slightly hacky and is | |||
implemented through the use of `objcopy`. An effort is ongoing for a more | |||
broadly adopted convention in the reference BLAS and LAPACK libraries, using | |||
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after | |||
Fortran compiler mangling. The central issue for this is | |||
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). | |||
For the most common cases of compiler mangling (a single `_` appended), the end | |||
result will be: | |||
| base API name | binary symbol name | call from Fortran code | call from C code | | |||
|---------------|--------------------|------------------------|-----------------------| | |||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | | |||
For other compiler mangling schemes, replace the trailing `_` by the scheme in use. | |||
The shared library name for this `_64` convention should be `libopenblas_64.so`. | |||
Note: it is not yet possible to produce an OpenBLAS build which employs this | |||
convention! Once reference BLAS and LAPACK with support for `_64` have been | |||
released, a future OpenBLAS release will support it. For now, please use the | |||
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be | |||
considered reserved for future use of the `_64` standard as prescribed by | |||
reference BLAS/LAPACK. | |||
## Performance and runtime behavior related build options | |||
For these options there are multiple reasonable or common choices. | |||
### Threading related options | |||
OpenBLAS can be built as a multi-threaded or single-threaded library, with the | |||
default being multi-threaded. It's expected that the default `libopenblas` | |||
library is multi-threaded; if you'd like to also distribute single-threaded | |||
builds, consider naming them `libopenblas_sequential`. | |||
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the | |||
default being pthreads. Both options are commonly used, and the choice here | |||
should not influence the shared library name. The choice will be captured by | |||
the `.pc` file. For example: | |||
```bash | |||
$ pkg-config --libs openblas | |||
-fopenmp -lopenblas | |||
$ cat openblas.pc | |||
... | |||
openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 | |||
``` | |||
The maximum number of threads users will be able to use is determined at build | |||
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide | |||
range of values that are reasonable to use (up to 256). 64 is a typical choice | |||
here; there is a memory footprint penalty that is linear in `NUM_THREADS`. | |||
Please see `Makefile.rule` for more details. | |||
### CPU architecture related options | |||
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when | |||
distributing to a user base with a variety of hardware, it is recommended to | |||
enable CPU architecture runtime detection. This will dynamically select | |||
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` | |||
build option. This is usually done on all common CPU families, except when | |||
there are known issues. | |||
In case the CPU architecture is known (e.g. you're building binaries for macOS | |||
M1 users), it is possible to specify the target architecture directly with the | |||
`TARGET=` build option. | |||
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` | |||
in this repository. | |||
## Real-world examples | |||
OpenBLAS is likely to be distributed in one of these distribution models: | |||
1. As a standalone package, or multiple packages, in a packaging ecosystem like | |||
a Linux distro, Homebrew, conda-forge or MSYS2. | |||
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. | |||
3. Locally, e.g. made available as a build on a single HPC cluster. | |||
The guidance on this page is most important for models (1) and (2). These links | |||
to build recipes for a representative selection of packaging systems may be | |||
helpful as a reference: | |||
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) | |||
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) | |||
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) | |||
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) | |||
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) | |||
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) | |||
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) |
@@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { | |||
increased_threads = 1; | |||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); | |||
thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
@@ -68,6 +68,7 @@ | |||
#endif | |||
int blas_server_avail = 0; | |||
int blas_omp_number_max = 0; | |||
extern int openblas_omp_adaptive_env(); | |||
@@ -100,8 +101,6 @@ static void adjust_thread_buffers() { | |||
void goto_set_num_threads(int num_threads) { | |||
blas_num_threads_set = 1; | |||
if (num_threads < 0) blas_num_threads_set = 0; | |||
if (num_threads < 1) num_threads = blas_num_threads; | |||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
@@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) { | |||
} | |||
int blas_thread_init(void){ | |||
if(blas_omp_number_max <= 0) | |||
blas_omp_number_max = omp_get_max_threads(); | |||
blas_get_cpu_number(); | |||
@@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) | |||
blas_server_avail = 1; | |||
} | |||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
blas_threads[i] = CreateThread(NULL, 0, | |||
blas_thread_server, (void *)i, | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; | |||
#else | |||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
#endif | |||
#ifdef DYN_ARMV8SVE | |||
extern gotoblas_t gotoblas_ARMV8SVE; | |||
#else | |||
#define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
#endif | |||
#ifdef DYN_CORTEX_A55 | |||
extern gotoblas_t gotoblas_CORTEXA55; | |||
#else | |||
@@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
#ifndef NO_SVE | |||
extern gotoblas_t gotoblas_NEOVERSEV1; | |||
extern gotoblas_t gotoblas_NEOVERSEN2; | |||
extern gotoblas_t gotoblas_ARMV8SVE; | |||
#else | |||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
#define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
#endif | |||
extern gotoblas_t gotoblas_THUNDERX3T110; | |||
extern gotoblas_t gotoblas_CORTEXA55; | |||
#endif | |||
extern void openblas_warning(int verbose, const char * msg); | |||
#define FALLBACK_VERBOSE 1 | |||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
#define NUM_CORETYPES 13 | |||
#define NUM_CORETYPES 16 | |||
/* | |||
* In case asm/hwcap.h is outdated on the build system, make sure | |||
@@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||
#ifndef HWCAP_CPUID | |||
#define HWCAP_CPUID (1 << 11) | |||
#endif | |||
#ifndef HWCAP_SVE | |||
#define HWCAP_SVE (1 << 22) | |||
#endif | |||
#define get_cpu_ftr(id, var) ({ \ | |||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
@@ -168,6 +181,7 @@ static char *corename[] = { | |||
"neoversen2", | |||
"thunderx3t110", | |||
"cortexa55", | |||
"armv8sve", | |||
"unknown" | |||
}; | |||
@@ -187,6 +201,7 @@ char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; | |||
return corename[NUM_CORETYPES]; | |||
} | |||
@@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
case 12: return (&gotoblas_NEOVERSEN2); | |||
case 13: return (&gotoblas_THUNDERX3T110); | |||
case 14: return (&gotoblas_CORTEXA55); | |||
case 15: return (&gotoblas_ARMV8SVE); | |||
} | |||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||
openblas_warning(1, message); | |||
@@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) { | |||
return &gotoblas_NEOVERSEN1; | |||
#ifndef NO_SVE | |||
case 0xd49: | |||
return &gotoblas_NEOVERSEN2; | |||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
return &gotoblas_NEOVERSEN1; | |||
} else | |||
return &gotoblas_NEOVERSEN2; | |||
case 0xd40: | |||
return &gotoblas_NEOVERSEV1; | |||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
return &gotoblas_NEOVERSEN1; | |||
}else | |||
return &gotoblas_NEOVERSEV1; | |||
#endif | |||
case 0xd05: // Cortex A55 | |||
return &gotoblas_CORTEXA55; | |||
@@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) { | |||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
openblas_warning(1, coremsg); | |||
} | |||
#ifndef NO_SVE | |||
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
return &gotoblas_ARMV8SVE; | |||
} | |||
#endif | |||
return NULL; | |||
#endif | |||
} | |||
@@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
*/ | |||
int blas_num_threads = 0; | |||
int blas_num_threads_set = 0; | |||
int goto_get_num_procs (void) { | |||
return blas_cpu_number; | |||
} | |||
@@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
*/ | |||
int blas_num_threads = 0; | |||
int blas_num_threads_set = 0; | |||
int goto_get_num_procs (void) { | |||
return blas_cpu_number; | |||
} | |||
@@ -283,7 +283,6 @@ The numbers of threads in the thread pool. | |||
This value is equal or large than blas_cpu_number. This means some threads are sleep. | |||
*/ | |||
int blas_num_threads = 0; | |||
int blas_num_threads_set = 0; | |||
int goto_get_num_procs (void) { | |||
return blas_cpu_number; | |||
@@ -101,7 +101,14 @@ else | |||
*flang*) | |||
vendor=FLANG | |||
openmp='-fopenmp' | |||
;; | |||
data=`$compiler -v 2>&1 > /dev/null ` | |||
v="${data#*version *}" | |||
v="${v%%*.}" | |||
major="${v%%.*}" | |||
if [ "$major" -ge 17 ]; then | |||
vendor=FLANGNEW | |||
fi | |||
;; | |||
*ifort*|*ifx*) | |||
vendor=INTEL | |||
openmp='-fopenmp' | |||
@@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
info = 0; | |||
if (lda < MAX(1, m)) info = 6; | |||
if (lda < MAX(1, m)) info = 5; | |||
if (ldc < MAX(1, m)) info = 8; | |||
if (n < 0) info = 2; | |||
@@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
if (n <= 0) return 0.; | |||
#ifndef COMPLEX | |||
if (n == 1) | |||
#ifdef DOUBLE | |||
return fabs(x[0]); | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
#ifdef COMPLEX | |||
x -= (n - 1) * incx * 2; | |||
#else | |||
x -= (n - 1) * incx; | |||
#endif | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
@@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
if (n <= 0) return 0.; | |||
#ifndef COMPLEX | |||
if (n == 1) | |||
#ifdef DOUBLE | |||
return fabs(x[0]); | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
#ifdef COMPLEX | |||
x -= (n - 1) * incx * 2; | |||
#else | |||
x -= (n - 1) * incx; | |||
#endif | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
@@ -33,7 +33,7 @@ endif | |||
ifdef TARGET_CORE | |||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
ifeq ($(GCCVERSIONGTEQ11), 1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | |||
override CFLAGS += -march=sapphirerapids | |||
else | |||
override CFLAGS += -march=skylake-avx512 -mavx512f | |||
@@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
endif | |||
else ifeq ($(TARGET_CORE), COOPERLAKE) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
ifeq ($(GCCVERSIONGTEQ10), 1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) | |||
override CFLAGS += -march=cooperlake | |||
else | |||
override CFLAGS += -march=skylake-avx512 -mavx512f | |||
@@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) | |||
ifeq ($(C_COMPILER), PGI) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics | |||
else | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
endif | |||
else | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
endif | |||
@@ -35,6 +35,12 @@ USE_TRMM = 1 | |||
endif | |||
endif | |||
ifneq ($(DYNAMIC_ARCH), 1) | |||
ifeq ($(TARGET), MIPS64_GENERIC) | |||
USE_TRMM = 1 | |||
endif | |||
endif | |||
ifeq ($(CORE), HASWELL) | |||
USE_TRMM = 1 | |||
endif | |||
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
FLOAT absxi = 0.0; | |||
if (n <= 0 || inc_x <= 0) return(0.0); | |||
if (n <= 0 || inc_x == 0) return(0.0); | |||
if ( n == 1 ) return( ABS(x[0]) ); | |||
n *= inc_x; | |||
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
BLASLONG inc_x2; | |||
FLOAT temp; | |||
if (n <= 0 || inc_x <= 0) return(0.0); | |||
if (n <= 0 || inc_x == 0) return(0.0); | |||
inc_x2 = 2 * inc_x; | |||
@@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S | |||
ZAMAXKERNEL = zamax.S | |||
SAXPYKERNEL = axpy.S | |||
DAXPYKERNEL = axpy.S | |||
DAXPYKERNEL = daxpy_thunderx2t99.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
@@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SASUMKERNEL = asum.S | |||
DASUMKERNEL = asum.S | |||
CASUMKERNEL = casum.S | |||
ZASUMKERNEL = zasum.S | |||
SCOPYKERNEL = copy.S | |||
DCOPYKERNEL = copy.S | |||
CCOPYKERNEL = copy.S | |||
ZCOPYKERNEL = copy.S | |||
SSWAPKERNEL = swap.S | |||
DSWAPKERNEL = swap.S | |||
CSWAPKERNEL = swap.S | |||
ZSWAPKERNEL = swap.S | |||
ISAMAXKERNEL = iamax.S | |||
IDAMAXKERNEL = iamax.S | |||
ICAMAXKERNEL = izamax.S | |||
IZAMAXKERNEL = izamax.S | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.S | |||
ifneq ($(C_COMPILER), PGI) | |||
SDOTKERNEL = ../generic/dot.c | |||
else | |||
SDOTKERNEL = dot.S | |||
endif | |||
ifneq ($(C_COMPILER), PGI) | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
else | |||
CDOTKERNEL = ../arm/zdot.c | |||
ZDOTKERNEL = ../arm/zdot.c | |||
endif | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
CASUMKERNEL = casum_thunderx2t99.c | |||
ZASUMKERNEL = zasum_thunderx2t99.c | |||
SCOPYKERNEL = copy_thunderx2t99.c | |||
DCOPYKERNEL = copy_thunderx2t99.c | |||
CCOPYKERNEL = copy_thunderx2t99.c | |||
ZCOPYKERNEL = copy_thunderx2t99.c | |||
SSWAPKERNEL = swap_thunderx2t99.S | |||
DSWAPKERNEL = swap_thunderx2t99.S | |||
CSWAPKERNEL = swap_thunderx2t99.S | |||
ZSWAPKERNEL = swap_thunderx2t99.S | |||
ISAMAXKERNEL = iamax_thunderx2t99.c | |||
IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
DSDOTKERNEL = dot.S | |||
DGEMM_BETA = dgemm_beta.S | |||
@@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c | |||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
@@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
@@ -1,98 +1 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
DAXPYKERNEL = daxpy_thunderx2t99.S | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
CASUMKERNEL = casum_thunderx2t99.c | |||
ZASUMKERNEL = zasum_thunderx2t99.c | |||
SCOPYKERNEL = copy_thunderx2t99.c | |||
DCOPYKERNEL = copy_thunderx2t99.c | |||
CCOPYKERNEL = copy_thunderx2t99.c | |||
ZCOPYKERNEL = copy_thunderx2t99.c | |||
SSWAPKERNEL = swap_thunderx2t99.S | |||
DSWAPKERNEL = swap_thunderx2t99.S | |||
CSWAPKERNEL = swap_thunderx2t99.S | |||
ZSWAPKERNEL = swap_thunderx2t99.S | |||
ISAMAXKERNEL = iamax_thunderx2t99.c | |||
IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRMMUNCOPY_M = | |||
CTRMMLNCOPY_M = | |||
CTRMMUTCOPY_M = | |||
CTRMMLTCOPY_M = | |||
CHEMMLTCOPY_M = | |||
CHEMMUTCOPY_M = | |||
CSYMMUCOPY_M = | |||
CSYMMLCOPY_M = | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
CGEMMINCOPYOBJ = | |||
CGEMMITCOPYOBJ = | |||
endif | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMCOPYLN_M = | |||
ZTRSMCOPYLT_M = | |||
ZTRSMCOPYUN_M = | |||
ZTRSMCOPYUT_M = | |||
ZTRMMUNCOPY_M = | |||
ZTRMMLNCOPY_M = | |||
ZTRMMUTCOPY_M = | |||
ZTRMMLTCOPY_M = | |||
ZHEMMLTCOPY_M = | |||
ZHEMMUTCOPY_M = | |||
ZSYMMUCOPY_M = | |||
ZSYMMLCOPY_M = | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
else | |||
ZGEMMINCOPYOBJ = | |||
ZGEMMITCOPYOBJ = | |||
endif | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) |
@@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M1 | |||
@@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M2 | |||
@@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ri z23.s, p1/m, z2.s, z15.s | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
add pB, pB, 32 | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_E | |||
@@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ii z22.s, p1/m, z3.s, z15.s | |||
OP_ri z23.s, p1/m, z2.s, z15.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
@@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ii z22.s, p1/m, z1.s, z15.s | |||
OP_ri z23.s, p1/m, z0.s, z15.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
@@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
st2w {z26.s, z27.s}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #3 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
fmla z28.s, p1/m, z20.s, alphaz_R | |||
fmls z28.s, p1/m, z21.s, alphaz_I | |||
@@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmla z31.s, p1/m, z23.s, alphaz_R | |||
st2w {z30.s, z31.s}, p1, [pCRow3] | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
@@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
st2w {z26.s, z27.s}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #3 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaz_R | |||
fmls z24.s, p1/m, z17.s, alphaz_I | |||
@@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alphaR, s0 | |||
dup alphaz_R, alphaR | |||
fmov alphaI, s1 | |||
@@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
bne .Lcgemm_kernel_L4_Mv1_46 | |||
.Lcgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Lcgemm_kernel_L4_Mv1_END: | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
aoffset += active * lda * 2; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
aoffset += active * 2; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { | |||
BLASLONG sve_width = SVE_WIDTH; | |||
for (BLASLONG i = 0; i < n; i += sve_width * 2) { | |||
svbool_t pg_a = SVE_WHILELT(i, n); | |||
svbool_t pg_b = SVE_WHILELT(i + sve_width, n); | |||
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); | |||
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); | |||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); | |||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); | |||
@@ -0,0 +1,121 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
// Element-width selection for this file. | |||
// With DOUBLE defined, the 64-bit SVE vector type, index type, index | |||
// generator, all-true predicate and while-less-than predicate are used, | |||
// and COUNT names the "cntd" instruction; otherwise the 32-bit | |||
// equivalents are used and COUNT names "cntw". | |||
// COUNT is a string literal pasted into the inline asm in CNAME. | |||
#ifdef DOUBLE | |||
#define COUNT "cntd" | |||
#define SV_TYPE svfloat64_t | |||
#define SV_INDEX svuint64_t | |||
#define SV_INDEXER svindex_u64 | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64 | |||
#else | |||
#define COUNT "cntw" | |||
#define SV_TYPE svfloat32_t | |||
#define SV_INDEX svuint32_t | |||
#define SV_INDEXER svindex_u32 | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32 | |||
#endif | |||
// INNER_COPY: one gather-and-store step of the copy. | |||
//   pg             - predicate selecting the active lanes | |||
//   a_offset_inner - source pointer; advanced by 2 elements | |||
//   b_offset       - destination pointer; advanced by active * 2 elements | |||
//   lda            - accepted but not referenced in the expansion | |||
//   active         - number of active lanes in pg | |||
// Gathers a_offset_inner[lda_vec[k]] into a_vec_real and | |||
// a_offset_inner[1 + lda_vec[k]] into a_vec_imag, then stores the two | |||
// vectors as a pair with svst2 at b_offset. | |||
// Uses a_vec_real, a_vec_imag and lda_vec from the enclosing scope. | |||
// Expands to several statements without a do/while wrapper, so it must | |||
// only be used where a statement sequence is valid (inside braces). | |||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \ | |||
a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \ | |||
svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \ | |||
a_offset_inner += 2; \ | |||
b_offset += active * 2; | |||
// Copies data from a into b using strided gathers. | |||
// n is processed in groups of sve_size lanes; for each group, m | |||
// INNER_COPY steps are executed, each writing 2 * active elements to b. | |||
// Between groups the source pointer moves forward by sve_size * lda * 2. | |||
// Presumably a holds interleaved real/imag pairs with leading dimension | |||
// lda - TODO confirm against the callers. | |||
// Always returns 0. | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
uint64_t sve_size; | |||
// Execute the COUNT instruction ("cntd" or "cntw") to obtain sve_size. | |||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
a_offset = a; | |||
b_offset = b; | |||
// Gather indices 0, 2*lda, 4*lda, ... (one per lane). | |||
// NOTE(review): without DOUBLE the index elements are 32-bit | |||
// (svindex_u32); confirm lda * 2 * lane cannot exceed that range. | |||
SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); | |||
SV_TYPE a_vec_real; | |||
SV_TYPE a_vec_imag; | |||
svbool_t pg_true = SV_TRUE(); | |||
// Portion of n handled with all lanes active. | |||
// NOTE(review): n & -sve_size rounds down to a multiple of sve_size | |||
// only if sve_size is a power of two - confirm for the targets. | |||
BLASLONG single_vectors_n = n & -sve_size; | |||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
a_offset_inner = a_offset; | |||
svbool_t pg = pg_true; | |||
uint64_t active = sve_size; | |||
// m steps in total: unrolled by four, then a tail of two and/or one. | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 2) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 1) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
a_offset += sve_size * lda * 2; | |||
} | |||
// Leftover lanes: same sequence with a partial predicate, so each | |||
// step advances b_offset by remaining_n * 2 instead of sve_size * 2. | |||
BLASLONG remaining_n = n - single_vectors_n; | |||
if (remaining_n) { | |||
a_offset_inner = a_offset; | |||
// Arguments are cast to uint64_t explicitly for the overloaded SV_WHILE. | |||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
uint64_t active = remaining_n; | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 2) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 1) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
} | |||
return 0; | |||
} | |||
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
BLASLONG remaining_n = n - single_vectors_n; | |||
if (remaining_n) { | |||
a_offset_inner = a_offset; | |||
svbool_t pg = SV_WHILE(0L, remaining_n); | |||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
uint64_t active = remaining_n; | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
@@ -0,0 +1,115 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
// Element-width selection for this file. | |||
// With DOUBLE defined, the 64-bit two-vector tuple type and 64-bit | |||
// predicates are used and COUNT names the "cntd" instruction; otherwise | |||
// the 32-bit equivalents are used and COUNT names "cntw". | |||
// COUNT is a string literal pasted into the inline asm in CNAME. | |||
#ifdef DOUBLE | |||
#define COUNT "cntd" | |||
#define SV_TYPE svfloat64x2_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64 | |||
#else | |||
#define COUNT "cntw" | |||
#define SV_TYPE svfloat32x2_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32 | |||
#endif | |||
// INNER_COPY: one load-and-store step of the copy. | |||
//   pg             - predicate selecting the active lanes | |||
//   a_offset_inner - source pointer; advanced by lda * 2 elements | |||
//   b_offset       - destination pointer; advanced by active * 2 elements | |||
//   lda            - source stride, in units of two elements | |||
//   active         - number of active lanes in pg | |||
// Loads a two-vector tuple from a_offset_inner with svld2 and writes it | |||
// straight back out at b_offset with svst2. | |||
// Uses a_vec from the enclosing scope. | |||
// Expands to several statements without a do/while wrapper, so it must | |||
// only be used where a statement sequence is valid (inside braces). | |||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
a_vec = svld2(pg, a_offset_inner); \ | |||
svst2(pg, b_offset, a_vec); \ | |||
a_offset_inner += lda * 2; \ | |||
b_offset += active * 2; | |||
// Copies data from a into b using contiguous two-vector loads. | |||
// n is processed in groups of sve_size lanes; for each group, m | |||
// INNER_COPY steps are executed, each reading at a stride of lda * 2 | |||
// and writing 2 * active elements to b. | |||
// Between groups the source pointer moves forward by sve_size * 2. | |||
// Presumably a holds interleaved real/imag pairs with leading dimension | |||
// lda - TODO confirm against the callers. | |||
// Always returns 0. | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
// NOTE(review): this svcntw() initialiser is overwritten by the asm on | |||
// the next line, which writes sve_size as a pure output ("=r"). | |||
uint64_t sve_size = svcntw(); | |||
// Execute the COUNT instruction ("cntd" or "cntw") to obtain sve_size. | |||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
a_offset = a; | |||
b_offset = b; | |||
SV_TYPE a_vec; | |||
svbool_t pg_true = SV_TRUE(); | |||
// Portion of n handled with all lanes active. | |||
// NOTE(review): n & -sve_size rounds down to a multiple of sve_size | |||
// only if sve_size is a power of two - confirm for the targets. | |||
BLASLONG single_vectors_n = n & -sve_size; | |||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
a_offset_inner = a_offset; | |||
svbool_t pg = pg_true; | |||
uint64_t active = sve_size; | |||
// m steps in total: unrolled by four, then a tail of two and/or one. | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 2) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 1) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
a_offset += sve_size * 2; | |||
} | |||
// Leftover lanes: same sequence with a partial predicate, so each | |||
// step advances b_offset by remaining_n * 2 instead of sve_size * 2. | |||
BLASLONG remaining_n = n - single_vectors_n; | |||
if (remaining_n) { | |||
a_offset_inner = a_offset; | |||
// Arguments are cast to uint64_t explicitly for the overloaded SV_WHILE. | |||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
uint64_t active = remaining_n; | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 2) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
if (m & 1) { | |||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
} | |||
} | |||
return 0; | |||
} | |||
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG remaining_n = n - single_vectors_n; | |||
if (remaining_n) { | |||
a_offset_inner = a_offset; | |||
svbool_t pg = SV_WHILE(0L, remaining_n); | |||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
uint64_t active = remaining_n; | |||
uint64_t i_cnt = m >> 2; | |||
while (i_cnt--) { | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
// When __NVCOMPILER is defined, fold the compiler's major and minor | |||
// version into one number (major * 100 + minor) and, for versions below | |||
// 2309, apply "#pragma opt 1" to this file. | |||
// NOTE(review): presumably a workaround for an optimiser problem in the | |||
// older compiler releases - confirm against the commit history. | |||
#ifdef __NVCOMPILER | |||
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) | |||
#if (NVCOMPVERS < 2309) | |||
#pragma opt 1 | |||
#endif | |||
#endif | |||
#include "common.h" | |||
@@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M1 | |||
@@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_M2 | |||
@@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ri z23.d, p1/m, z2.d, z15.d | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
add pB, pB, 64 | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_E | |||
@@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ir z23.d, p1/m, z3.d, z14.d | |||
OP_ii z22.d, p1/m, z3.d, z15.d | |||
OP_ri z23.d, p1/m, z2.d, z15.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
@@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
OP_ir z23.d, p1/m, z1.d, z14.d | |||
OP_ii z22.d, p1/m, z1.d, z15.d | |||
OP_ri z23.d, p1/m, z0.d, z15.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
@@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
st2d {z26.d, z27.d}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #4 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld2d {z28.d, z29.d}, p1/z, [pCRow2] | |||
fmla z28.d, p1/m, z20.d, alphaz_R | |||
@@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmla z31.d, p1/m, z23.d, alphaz_R | |||
st2d {z30.d, z31.d}, p1, [pCRow3] | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
@@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
st2d {z26.d, z27.d}, p1, [pCRow1] | |||
add pCRow1, pCRow1, lanes, lsl #4 | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaz_R | |||
fmls z24.d, p1/m, z17.d, alphaz_I | |||
@@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
.endm | |||
/******************************************************************************/ | |||
@@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alphaR, d0 | |||
dup alphaz_R, alphaR | |||
fmov alphaI, d1 | |||
@@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
bne .Lzgemm_kernel_L4_Mv1_46 | |||
.Lzgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Lzgemm_kernel_L4_Mv1_END: | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
aoffset += active * lda * 2; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
aoffset += active * 2; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t j = 0; | |||
int32_t N = n; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b32(offset, 0); | |||
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
data_vec_imag = svneg_z(pg, data_vec_imag); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
@@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
@@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t j = 0; | |||
int32_t N = n; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
data_vec_imag = svneg_z(pg, data_vec_imag); | |||
if (offset <= 0) { | |||
svbool_t off_g = svwhilelt_b32(offset, 0); | |||
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); | |||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
} | |||
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
@@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -1,5 +1,6 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* Copyright 2023 The OpenBLAS Project */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
jj = offset; | |||
#ifdef DOUBLE | |||
int64_t js = 0; | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
int32_t N = n; | |||
int32_t js = 0; | |||
svbool_t pn = svwhilelt_b32(js, N); | |||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do { | |||
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, N); | |||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
BLASLONG i, ii, j, jj; | |||
FLOAT data01, data02; | |||
FLOAT data01=0.0, data02=0.0; | |||
FLOAT *a1; | |||
lda *= 2; | |||
@@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
FLOAT data05, data06, data07, data08; | |||
FLOAT *a1, *a2; | |||
data01=data02=data07=data08=0.0; | |||
lda *= 2; | |||
jj = offset; | |||
@@ -1,3 +1,4 @@ | |||
ifndef NO_LASX | |||
DGEMMKERNEL = dgemm_kernel_16x4.S | |||
DGEMMINCOPY = dgemm_ncopy_16.S | |||
DGEMMITCOPY = dgemm_tcopy_16.S | |||
@@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMVNKERNEL = dgemv_n_8_lasx.S | |||
DGEMVTKERNEL = dgemv_t_8_lasx.S | |||
SGEMMKERNEL = sgemm_kernel_16x8_lasx.S | |||
SGEMMINCOPY = sgemm_ncopy_16_lasx.S | |||
SGEMMITCOPY = sgemm_tcopy_16_lasx.S | |||
SGEMMONCOPY = sgemm_ncopy_8_lasx.S | |||
SGEMMOTCOPY = sgemm_tcopy_8_lasx.S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c |
@@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c | |||
ZSWAPKERNEL = ../arm/zswap.c | |||
SGEMVNKERNEL = ../arm/gemv_n.c | |||
ifndef DGEMVNKERNEL | |||
DGEMVNKERNEL = ../arm/gemv_n.c | |||
endif | |||
CGEMVNKERNEL = ../arm/zgemv_n.c | |||
ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
SGEMVTKERNEL = ../arm/gemv_t.c | |||
ifndef DGEMVTKERNEL | |||
DGEMVTKERNEL = ../arm/gemv_t.c | |||
endif | |||
CGEMVTKERNEL = ../arm/zgemv_t.c | |||
ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmov.d s2, s1 | |||
bge $r0, N, .L999 | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
bge $r0, INCX, .L999 | |||
beq $r0, INCX, .L999 | |||
srai.d I, N, 2 | |||
bge $r0, I, .L25 | |||
LD a1, X, 0 * SIZE | |||
@@ -0,0 +1,546 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/07/14 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Register aliases used throughout this kernel.
//
// Arguments, named after the parameters in the CNAME prototype comment.
// NOTE(review): INC_Y is bound to $r6, which sits between N ($r5) and
// A ($r7) where the prototype lists dummy1; presumably inc_y is loaded into
// $r6 by code outside this view -- confirm against the function prologue.
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA $f0 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
#define INC_Y $r6 | |||
// General-purpose working registers. T0 is the temporary address register
// clobbered by the *_GAP load/store macros. The roles of the others are
// not visible in this part of the file; by name they look like loop
// counters (J, I, K), a saved Y pointer (Y_ORG) and derived
// offsets/strides (OFFSET, K_LDA, M8) -- TODO confirm.
#define J $r12 | |||
#define I $r13 | |||
#define K $r14 | |||
#define Y_ORG $r15 | |||
#define OFFSET $r16 | |||
#define K_LDA $r17 | |||
#define M8 $r18 | |||
#define T0 $r19 | |||
// PA0..PA7: eight general-purpose registers; presumably pointers into
// eight columns of A -- TODO confirm where they are initialised.
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
// Vector ($xr) registers.
// VALPHA is the multiplier applied to every X register by the DLOAD_X_*
// macros.
#define VALPHA $xr1 | |||
// X0..X7 receive the values loaded from X by the DLOAD_X_* macros.
#define X0 $xr2 | |||
#define X1 $xr3 | |||
#define X2 $xr4 | |||
#define X3 $xr5 | |||
#define X4 $xr6 | |||
#define X5 $xr7 | |||
#define X6 $xr8 | |||
#define X7 $xr9 | |||
// Y0/Y1 hold the y values loaded/stored by the DLOAD_Y_* / DSTORE_Y_*
// macros.
#define Y0 $xr10 | |||
#define Y1 $xr11 | |||
// A0..A15: further vector registers. In this part of the file only
// A1..A3 and A5..A7 are referenced, as scratch by the strided DLOAD_Y_*_GAP
// macros, which write the same-numbered $f registers ($f13.., $f17..).
// NOTE(review): by name A0..A15 presumably also hold matrix data -- confirm.
#define A0 $xr12 | |||
#define A1 $xr13 | |||
#define A2 $xr14 | |||
#define A3 $xr15 | |||
#define A4 $xr16 | |||
#define A5 $xr17 | |||
#define A6 $xr18 | |||
#define A7 $xr19 | |||
#define A8 $xr20 | |||
#define A9 $xr21 | |||
#define A10 $xr22 | |||
#define A11 $xr23 | |||
#define A12 $xr24 | |||
#define A13 $xr25 | |||
#define A14 $xr26 | |||
#define A15 $xr27 | |||
// DLOAD_X_8: fill X0..X7 from byte offsets 0x00..0x38 of X (eight
// consecutive 8-byte slots), then multiply each register by VALPHA in place.
// GLDREPL and GMUL are helper macros that are not defined in this file view
// (presumably from loongarch64_asm.S). DLOAD_X_8_GAP spells the same load as
// xvldrepl.d, so GLDREPL xv, d presumably emits that instruction once per
// operand triple -- confirm.
.macro DLOAD_X_8 | |||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ | |||
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 | |||
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ | |||
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA | |||
.endm | |||
// DLOAD_X_4: four-element form of DLOAD_X_8. Fills X0..X3 from byte offsets
// 0x00..0x18 of X and multiplies each by VALPHA in place. GLDREPL/GMUL are
// helper macros defined outside this view.
.macro DLOAD_X_4 | |||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 | |||
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA | |||
.endm | |||
// DLOAD_X_2: two-element form. Fills X0 and X1 from byte offsets 0x00 and
// 0x08 of X and multiplies each by VALPHA in place. GLDREPL/GMUL are helper
// macros defined outside this view.
.macro DLOAD_X_2 | |||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 | |||
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA | |||
.endm | |||
// DLOAD_X_1: single-element form. Fills X0 from byte offset 0x00 of X and
// multiplies it by VALPHA in place. GLDREPL/GMUL are helper macros defined
// outside this view.
.macro DLOAD_X_1 | |||
GLDREPL xv, d, X0, X, 0x00 | |||
GMUL xvf, d, X0, X0, VALPHA | |||
.endm | |||
// DLOAD_Y_8: load Y0 from Y+0 and Y1 from Y+0x20 through the GLD helper
// (defined outside this view). NOTE(review): presumably each is a full-width
// vector load, i.e. eight contiguous doubles in total -- confirm against the
// GLD definition.
.macro DLOAD_Y_8 | |||
GLD xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// DLOAD_Y_4: load Y0 from Y+0 through the GLD helper (defined outside this
// view). Counterpart of DSTORE_Y_4.
.macro DLOAD_Y_4 | |||
GLD xv, , Y0, Y, 0 | |||
.endm | |||
// DLOAD_Y_1: load a single double from Y+0 into $f10.
// $f10 carries the same register number as Y0 ($xr10).
// NOTE(review): presumably this relies on $f10 aliasing the low 64 bits of
// $xr10, so the value lands in lane 0 of Y0 -- confirm.
.macro DLOAD_Y_1 | |||
fld.d $f10, Y, 0 | |||
.endm | |||
// DSTORE_Y_8: store Y0 to Y+0 and Y1 to Y+0x20 through the GST helper
// (defined outside this view). Mirrors DLOAD_Y_8, using the same registers
// and offsets.
.macro DSTORE_Y_8 | |||
GST xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// DSTORE_Y_4: store Y0 to Y+0 through the GST helper (defined outside this
// view). Mirrors DLOAD_Y_4.
.macro DSTORE_Y_4 | |||
GST xv, , Y0, Y, 0 | |||
.endm | |||
// DSTORE_Y_1: store the double held in $f10 to Y+0. Mirrors DLOAD_Y_1;
// $f10 carries the same register number as Y0 ($xr10).
.macro DSTORE_Y_1 | |||
fst.d $f10, Y, 0 | |||
.endm | |||
// Strided y: consecutive elements are INC_Y bytes apart, so vector load/store | |||
// instructions cannot be used and the values are moved one double at a time. | |||
// DLOAD_Y_8_GAP gathers eight y values into Y0 and Y1. | |||
.macro DLOAD_Y_8_GAP | |||
// y[0] goes straight to $f10 (register number of Y0); y[1..3] land in $f13-$f15 (A1-A3 as scratch). | |||
fld.d $f10, Y, 0 | |||
fldx.d $f13, Y, INC_Y | |||
// T0 = Y + 2 * INC_Y | |||
PTR_ALSL T0, INC_Y, Y, 1 | |||
fld.d $f14, T0, 0 | |||
fldx.d $f15, T0, INC_Y | |||
// T0 = Y + 4 * INC_Y; y[4] goes to $f11 (register number of Y1), y[5..7] to $f17-$f19 (A5-A7). | |||
PTR_ALSL T0, INC_Y, Y, 2 | |||
fld.d $f11, T0, 0 | |||
fldx.d $f17, T0, INC_Y | |||
// Two single steps bring T0 to Y + 6 * INC_Y. | |||
PTR_ADD T0, T0, INC_Y | |||
PTR_ADD T0, T0, INC_Y | |||
fld.d $f18, T0, 0 | |||
fldx.d $f19, T0, INC_Y | |||
// GINSVE0 is defined outside this view; presumably it copies element 0 of the | |||
// source register into the given lane of the destination - confirm in the macro header. | |||
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 | |||
.endm | |||
// DLOAD_Y_4_GAP: gather four strided y values (INC_Y bytes apart) into Y0. | |||
.macro DLOAD_Y_4_GAP | |||
// y[0] goes to $f10 (register number of Y0); y[1..3] go to $f13-$f15 (A1-A3 used as scratch). | |||
fld.d $f10, Y, 0 | |||
fldx.d $f13, Y, INC_Y | |||
// T0 = Y + 2 * INC_Y | |||
PTR_ALSL T0, INC_Y, Y, 1 | |||
fld.d $f14, T0, 0 | |||
fldx.d $f15, T0, INC_Y | |||
// GINSVE0 is defined outside this view; presumably it inserts element 0 of A1/A2/A3 | |||
// into lanes 1/2/3 of Y0 - confirm in the macro header. | |||
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 | |||
.endm | |||
// DSTORE_Y_8_GAP: scatter the eight doubles of Y0 and Y1 to strided y. | |||
// Each xvstelm.d writes one element (last operand is the element index); | |||
// T0 walks forward by INC_Y bytes between stores. Y itself is not modified. | |||
.macro DSTORE_Y_8_GAP | |||
xvstelm.d Y0, Y, 0, 0 | |||
PTR_ADD T0, Y, INC_Y | |||
xvstelm.d Y0, T0, 0, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 2 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 3 | |||
PTR_ADD T0, T0, INC_Y | |||
// Second half: elements 0..3 of Y1 go to y[4..7]. | |||
xvstelm.d Y1, T0, 0, 0 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 2 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 3 | |||
.endm | |||
// DSTORE_Y_4_GAP: scatter the four doubles of Y0 to strided y. | |||
// T0 advances by INC_Y bytes between the element stores; Y itself is not modified. | |||
.macro DSTORE_Y_4_GAP | |||
xvstelm.d Y0, Y, 0, 0 | |||
PTR_ADD T0, Y, INC_Y | |||
xvstelm.d Y0, T0, 0, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 2 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 3 | |||
.endm | |||
// DLOAD_X_8_GAP: strided x. Replicate eight x values, INC_X bytes apart, into X0..X7 | |||
// (T0 walks the vector), then multiply each of them by VALPHA. | |||
.macro DLOAD_X_8_GAP | |||
    xvldrepl.d X0, X, 0x00 | |||
    PTR_ADD    T0, X, INC_X | |||
    xvldrepl.d X1, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X2, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X3, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X4, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X5, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X6, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X7, T0, 0x00 | |||
    // Scale by alpha. | |||
    xvfmul.d X0, X0, VALPHA | |||
    xvfmul.d X1, X1, VALPHA | |||
    xvfmul.d X2, X2, VALPHA | |||
    xvfmul.d X3, X3, VALPHA | |||
    xvfmul.d X4, X4, VALPHA | |||
    xvfmul.d X5, X5, VALPHA | |||
    xvfmul.d X6, X6, VALPHA | |||
    xvfmul.d X7, X7, VALPHA | |||
.endm | |||
// DLOAD_X_4_GAP: strided x. Replicate four x values, INC_X bytes apart, into X0..X3 | |||
// and multiply each of them by VALPHA. | |||
.macro DLOAD_X_4_GAP | |||
    xvldrepl.d X0, X, 0x00 | |||
    PTR_ADD    T0, X, INC_X | |||
    xvldrepl.d X1, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X2, T0, 0x00 | |||
    PTR_ADD    T0, T0, INC_X | |||
    xvldrepl.d X3, T0, 0x00 | |||
    // Scale by alpha. | |||
    xvfmul.d X0, X0, VALPHA | |||
    xvfmul.d X1, X1, VALPHA | |||
    xvfmul.d X2, X2, VALPHA | |||
    xvfmul.d X3, X3, VALPHA | |||
.endm | |||
// DLOAD_X_2_GAP: strided x. Replicate two x values, INC_X bytes apart, into X0/X1 | |||
// and multiply both by VALPHA. | |||
.macro DLOAD_X_2_GAP | |||
    xvldrepl.d X0, X, 0x00 | |||
    PTR_ADD    T0, X, INC_X | |||
    xvldrepl.d X1, T0, 0x00 | |||
    // Scale by alpha. | |||
    xvfmul.d X0, X0, VALPHA | |||
    xvfmul.d X1, X1, VALPHA | |||
.endm | |||
// DGEMV_N_8x8: eight rows against eight columns. | |||
// Two vectors are loaded from each column pointer (every load post-increments the | |||
// pointer by 0x20 bytes), then Y0/Y1 accumulate column * X for all eight columns. | |||
.macro DGEMV_N_8x8 | |||
    // Loads, one column per line. | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0, A5, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0, A7, PA3, 0 | |||
    GLD_INC xv, , 0x20, A8, PA4, 0, A9, PA4, 0 | |||
    GLD_INC xv, , 0x20, A10, PA5, 0, A11, PA5, 0 | |||
    GLD_INC xv, , 0x20, A12, PA6, 0, A13, PA6, 0 | |||
    GLD_INC xv, , 0x20, A14, PA7, 0, A15, PA7, 0 | |||
    // Fused multiply-adds, one column per line. | |||
    GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1 | |||
    GMADD xvf, d, Y0, A2, X1, Y0, Y1, A3, X1, Y1 | |||
    GMADD xvf, d, Y0, A4, X2, Y0, Y1, A5, X2, Y1 | |||
    GMADD xvf, d, Y0, A6, X3, Y0, Y1, A7, X3, Y1 | |||
    GMADD xvf, d, Y0, A8, X4, Y0, Y1, A9, X4, Y1 | |||
    GMADD xvf, d, Y0, A10, X5, Y0, Y1, A11, X5, Y1 | |||
    GMADD xvf, d, Y0, A12, X6, Y0, Y1, A13, X6, Y1 | |||
    GMADD xvf, d, Y0, A14, X7, Y0, Y1, A15, X7, Y1 | |||
.endm | |||
// DGEMV_N_4x8: four rows against eight columns. | |||
// One vector is loaded per column pointer (post-increment 0x20 bytes) and | |||
// accumulated into Y0 with the matching X register. | |||
.macro DGEMV_N_4x8 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0 | |||
    GLD_INC xv, , 0x20, A8, PA4, 0 | |||
    GLD_INC xv, , 0x20, A10, PA5, 0 | |||
    GLD_INC xv, , 0x20, A12, PA6, 0 | |||
    GLD_INC xv, , 0x20, A14, PA7, 0 | |||
    GMADD xvf, d, Y0, A0, X0, Y0 | |||
    GMADD xvf, d, Y0, A2, X1, Y0 | |||
    GMADD xvf, d, Y0, A4, X2, Y0 | |||
    GMADD xvf, d, Y0, A6, X3, Y0 | |||
    GMADD xvf, d, Y0, A8, X4, Y0 | |||
    GMADD xvf, d, Y0, A10, X5, Y0 | |||
    GMADD xvf, d, Y0, A12, X6, Y0 | |||
    GMADD xvf, d, Y0, A14, X7, Y0 | |||
.endm | |||
// DGEMV_N_1x8: one row against eight columns (scalar path). | |||
// Loads one double from each of PA0..PA7 (every pointer is post-incremented by 8 bytes) | |||
// and accumulates a * x into $f10, with $f2..$f9 supplying the eight x operands. | |||
// The argument list no longer ends in a dangling comma, so no empty trailing | |||
// argument is handed to GMADD. | |||
.macro DGEMV_N_1x8 | |||
    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ | |||
                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 | |||
    GMADD f, d, $f10, $f12, $f2, $f10, \ | |||
                $f10, $f14, $f3, $f10, \ | |||
                $f10, $f16, $f4, $f10, \ | |||
                $f10, $f18, $f5, $f10, \ | |||
                $f10, $f20, $f6, $f10, \ | |||
                $f10, $f22, $f7, $f10, \ | |||
                $f10, $f24, $f8, $f10, \ | |||
                $f10, $f26, $f9, $f10 | |||
.endm | |||
// DGEMV_N_8x4: eight rows against four columns. | |||
// Two vectors per column pointer PA0..PA3 (post-increment 0x20 bytes per load), | |||
// accumulated into Y0/Y1 with X0..X3. | |||
.macro DGEMV_N_8x4 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0, A5, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0, A7, PA3, 0 | |||
    GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1 | |||
    GMADD xvf, d, Y0, A2, X1, Y0, Y1, A3, X1, Y1 | |||
    GMADD xvf, d, Y0, A4, X2, Y0, Y1, A5, X2, Y1 | |||
    GMADD xvf, d, Y0, A6, X3, Y0, Y1, A7, X3, Y1 | |||
.endm | |||
// DGEMV_N_4x4: four rows against four columns, accumulated into Y0. | |||
.macro DGEMV_N_4x4 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0 | |||
    GMADD xvf, d, Y0, A0, X0, Y0 | |||
    GMADD xvf, d, Y0, A2, X1, Y0 | |||
    GMADD xvf, d, Y0, A4, X2, Y0 | |||
    GMADD xvf, d, Y0, A6, X3, Y0 | |||
.endm | |||
// DGEMV_N_1x4: one row against four columns (scalar path), accumulated into $f10. | |||
// Each load post-increments its column pointer by 8 bytes. | |||
.macro DGEMV_N_1x4 | |||
    GLD_INC f, d, 0x08, $f12, PA0, 0 | |||
    GLD_INC f, d, 0x08, $f14, PA1, 0 | |||
    GLD_INC f, d, 0x08, $f16, PA2, 0 | |||
    GLD_INC f, d, 0x08, $f18, PA3, 0 | |||
    GMADD f, d, $f10, $f12, $f2, $f10 | |||
    GMADD f, d, $f10, $f14, $f3, $f10 | |||
    GMADD f, d, $f10, $f16, $f4, $f10 | |||
    GMADD f, d, $f10, $f18, $f5, $f10 | |||
.endm | |||
// DGEMV_N_8x2: eight rows against two columns, accumulated into Y0/Y1 with X0 and X1. | |||
.macro DGEMV_N_8x2 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1 | |||
    GMADD xvf, d, Y0, A2, X1, Y0, Y1, A3, X1, Y1 | |||
.endm | |||
// DGEMV_N_4x2: four rows against two columns, accumulated into Y0. | |||
.macro DGEMV_N_4x2 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0 | |||
    GMADD xvf, d, Y0, A0, X0, Y0 | |||
    GMADD xvf, d, Y0, A2, X1, Y0 | |||
.endm | |||
// DGEMV_N_1x2: one row against two columns (scalar path), accumulated into $f10. | |||
.macro DGEMV_N_1x2 | |||
    GLD_INC f, d, 0x08, $f12, PA0, 0 | |||
    GLD_INC f, d, 0x08, $f14, PA1, 0 | |||
    GMADD f, d, $f10, $f12, $f2, $f10 | |||
    GMADD f, d, $f10, $f14, $f3, $f10 | |||
.endm | |||
// DGEMV_N_1x1: one row against one column (scalar path). | |||
// Loads one double from PA0, steps PA0 by 8 bytes and accumulates a * $f2 into $f10. | |||
.macro DGEMV_N_1x1 | |||
    GLD_INC f, d, 0x08, $f12, PA0, 0 | |||
    GMADD f, d, $f10, $f12, $f2, $f10 | |||
.endm | |||
// DGEMV_N: expands to a complete y += A * (alpha * x) driver for one stride combination. | |||
//   XW                 - label infix; keeps the local labels of each instantiation unique | |||
//   X_8, X_4, X_2, X_1 - suffixes of the DLOAD_* macro that fetches 8/4/2/1 x values | |||
//   Y_8, Y_4, Y_1      - suffixes of the DLOAD_*/DSTORE_* macros for 8/4/1 y values | |||
// Columns are consumed in groups of 8, then 4, 2 and 1. Inside every column group | |||
// the rows are consumed in groups of 8, then 4, then one at a time, restarting at | |||
// Y_ORG each time. After a column group the column pointers are stepped by | |||
// (group width * LDA - M8) bytes, i.e. past the rows just consumed and on to the next group. | |||
// K counts the rows processed; nothing inside this macro reads it back. | |||
// Every path ends by branching to .L_END. | |||
.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req | |||
    // ---------------- column groups of 8 ---------------- | |||
    PTR_SRLI J, N, 3 | |||
    beqz J, .L_\XW\()_N_7 | |||
    // K_LDA = 8 * LDA - M8 (LDA and M8 are already byte counts). | |||
    PTR_SLLI K_LDA, LDA, 3 | |||
    PTR_SUB K_LDA, K_LDA, M8 | |||
.L_\XW\()_N_L8: | |||
    DLOAD_\X_8 | |||
    xor K, K, K | |||
    move Y, Y_ORG | |||
    PTR_SRLI I, M, 3 | |||
    beqz I, .L_\XW\()_M_7 | |||
.align 5 | |||
.L_\XW\()_M_L8: | |||
    // 8 rows x 8 columns | |||
    DLOAD_\Y_8 | |||
    DGEMV_N_8x8 | |||
    DSTORE_\Y_8 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ALSL Y, INC_Y, Y, 3 | |||
    PTR_ADDI K, K, 8 | |||
    bnez I, .L_\XW\()_M_L8 | |||
.L_\XW\()_M_7: | |||
    // 4 rows x 8 columns, taken when bit 2 of M is set | |||
    andi I, M, 4 | |||
    beqz I, .L_\XW\()_M_3 | |||
    DLOAD_\Y_4 | |||
    DGEMV_N_4x8 | |||
    DSTORE_\Y_4 | |||
    PTR_ALSL Y, INC_Y, Y, 2 | |||
    PTR_ADDI K, K, 4 | |||
.L_\XW\()_M_3: | |||
    // remaining M & 3 rows, one at a time | |||
    andi I, M, 3 | |||
    beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
    DLOAD_\Y_1 | |||
    DGEMV_N_1x8 | |||
    DSTORE_\Y_1 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADD Y, Y, INC_Y | |||
    PTR_ADDI K, K, 1 | |||
    bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
    PTR_ADDI J, J, -1 | |||
    // Step all eight column pointers on to the next group of eight columns. | |||
#if __loongarch_grlen == 64 | |||
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#else | |||
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#endif | |||
    PTR_ALSL X, INC_X, X, 3 | |||
    bnez J, .L_\XW\()_N_L8 | |||
.L_\XW\()_N_7: | |||
    // ---------------- 4 remaining columns (bit 2 of N) ---------------- | |||
    andi J, N, 4 | |||
    beqz J, .L_\XW\()_N_3 | |||
    DLOAD_\X_4 | |||
    xor K, K, K | |||
    move Y, Y_ORG | |||
    PTR_SRLI I, M, 3 | |||
    beqz I, .L_\XW\()_N_4_M_7 | |||
.align 5 | |||
.L_\XW\()_N_4_M_L8: | |||
    // 8 rows x 4 columns | |||
    DLOAD_\Y_8 | |||
    DGEMV_N_8x4 | |||
    DSTORE_\Y_8 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADDI K, K, 8 | |||
    PTR_ALSL Y, INC_Y, Y, 3 | |||
    bnez I, .L_\XW\()_N_4_M_L8 | |||
.L_\XW\()_N_4_M_7: | |||
    // 4 rows x 4 columns | |||
    andi I, M, 4 | |||
    beqz I, .L_\XW\()_N_4_M_3 | |||
    DLOAD_\Y_4 | |||
    DGEMV_N_4x4 | |||
    DSTORE_\Y_4 | |||
    PTR_ALSL Y, INC_Y, Y, 2 | |||
    PTR_ADDI K, K, 4 | |||
.L_\XW\()_N_4_M_3: | |||
    // remaining M & 3 rows | |||
    andi I, M, 3 | |||
    beqz I, .L_\XW\()_N_4_M_END | |||
.align 5 | |||
.L_\XW\()_N_4_M_L1: | |||
    DLOAD_\Y_1 | |||
    DGEMV_N_1x4 | |||
    DSTORE_\Y_1 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADD Y, Y, INC_Y | |||
    PTR_ADDI K, K, 1 | |||
    bnez I, .L_\XW\()_N_4_M_L1 | |||
.L_\XW\()_N_4_M_END: | |||
    // K_LDA = 4 * LDA - M8 | |||
    PTR_SLLI K_LDA, LDA, 2 | |||
    PTR_SUB K_LDA, K_LDA, M8 | |||
    // The non-64-bit branch uses the .w form, like the 8-column loop above | |||
    // (it previously repeated the .d form of the 64-bit branch). | |||
#if __loongarch_grlen == 64 | |||
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#else | |||
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#endif | |||
    PTR_ALSL X, INC_X, X, 2 | |||
.L_\XW\()_N_3: | |||
    // ---------------- 2 remaining columns (bit 1 of N) ---------------- | |||
    andi J, N, 2 | |||
    beqz J, .L_\XW\()_N_1 | |||
    DLOAD_\X_2 | |||
    xor K, K, K | |||
    move Y, Y_ORG | |||
    PTR_SRLI I, M, 3 | |||
    beqz I, .L_\XW\()_N_2_M_7 | |||
.align 5 | |||
.L_\XW\()_N_2_M_L8: | |||
    // 8 rows x 2 columns | |||
    DLOAD_\Y_8 | |||
    DGEMV_N_8x2 | |||
    DSTORE_\Y_8 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADDI K, K, 8 | |||
    PTR_ALSL Y, INC_Y, Y, 3 | |||
    bnez I, .L_\XW\()_N_2_M_L8 | |||
.L_\XW\()_N_2_M_7: | |||
    // 4 rows x 2 columns | |||
    andi I, M, 4 | |||
    beqz I, .L_\XW\()_N_2_M_3 | |||
    DLOAD_\Y_4 | |||
    DGEMV_N_4x2 | |||
    DSTORE_\Y_4 | |||
    PTR_ALSL Y, INC_Y, Y, 2 | |||
    PTR_ADDI K, K, 4 | |||
.L_\XW\()_N_2_M_3: | |||
    // remaining M & 3 rows | |||
    andi I, M, 3 | |||
    beqz I, .L_\XW\()_N_2_M_END | |||
.align 5 | |||
.L_\XW\()_N_2_M_L1: | |||
    DLOAD_\Y_1 | |||
    DGEMV_N_1x2 | |||
    DSTORE_\Y_1 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADD Y, Y, INC_Y | |||
    PTR_ADDI K, K, 1 | |||
    bnez I, .L_\XW\()_N_2_M_L1 | |||
.L_\XW\()_N_2_M_END: | |||
    // K_LDA = 2 * LDA - M8; only PA0 and PA1 were used by this group. | |||
    PTR_SLLI K_LDA, LDA, 1 | |||
    PTR_SUB K_LDA, K_LDA, M8 | |||
    PTR_ADD PA0, PA0, K_LDA | |||
    PTR_ADD PA1, PA1, K_LDA | |||
    PTR_ALSL X, INC_X, X, 1 | |||
.L_\XW\()_N_1: | |||
    // ---------------- last single column (bit 0 of N) ---------------- | |||
    andi J, N, 1 | |||
    beqz J, .L_END | |||
    DLOAD_\X_1 | |||
    xor K, K, K | |||
    move Y, Y_ORG | |||
    move I, M | |||
    beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
    DLOAD_\Y_1 | |||
    DGEMV_N_1x1 | |||
    DSTORE_\Y_1 | |||
    PTR_ADDI I, I, -1 | |||
    PTR_ADD Y, Y, INC_Y | |||
    PTR_ADDI K, K, 1 | |||
    bnez I, .L_\XW\()_N_1_M_L1 | |||
    b .L_END | |||
.endm | |||
// Kernel entry: set up strides and column pointers, then jump to the DGEMV_N | |||
// instantiation that matches the (inc_x, inc_y) combination. | |||
PROLOGUE | |||
// inc_y is read from the stack at offset 0, before push_if_used moves $sp. | |||
PTR_LD INC_Y, $sp, 0 | |||
// Reserve and fill save slots for 7 integer and 4 FP registers beyond the caller-saved sets. | |||
push_if_used 17 + 7, 24 + 4 | |||
// K = 1; I = inc_x - 1; J = inc_y - 1, then each is collapsed to a 0/1 flag. | |||
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
PTR_SUB J, INC_Y, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ | |||
// I = 2 * I + J: index into .L_GAP_TABLE below. | |||
PTR_ALSL I, I, J, 1 | |||
// Convert LDA, INC_X, INC_Y to byte counts (shift by 3) and derive M8 = M << 3. | |||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
// Replicate element 0 of $xr0 into every lane of VALPHA (presumably alpha arrives there - confirm against the argument list). | |||
xvreplve0.d VALPHA, $xr0 | |||
move Y_ORG, Y | |||
// PA0 = A, and each following pointer is one LDA (in bytes) further. | |||
move PA0, A | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#else | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#endif | |||
// Table dispatch: entries are 16-bit offsets relative to .L_GAP_TABLE, | |||
// so the entry address is table + 2 * index and the target is table + entry. | |||
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
.L_GAP_TABLE: | |||
.hword .L_GAP_0_0 - .L_GAP_TABLE | |||
.hword .L_GAP_0_1 - .L_GAP_TABLE | |||
.hword .L_GAP_1_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1_1 - .L_GAP_TABLE | |||
// Each DGEMV_N expansion ends with a branch to .L_END, so the bodies do not fall through. | |||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 | |||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 | |||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
.L_END: | |||
// Restore the registers saved above (same counts as the push) and return. | |||
pop_if_used 17 + 7, 24 + 4 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE | |||
@@ -0,0 +1,468 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/07/17 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Argument and scratch register aliases for the DGEMV_T kernel. | |||
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA $f0 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
// INC_Y is loaded from the stack in the prologue. | |||
#define INC_Y $r6 | |||
// Loop counters: J counts column groups, I counts row groups. | |||
#define J $r12 | |||
#define I $r13 | |||
// K and PY0 share $r14: K is only used by the prologue dispatch, PY0 only inside DGEMV_T. | |||
#define K $r14 | |||
#define PY0 $r14 | |||
// Start of x, reloaded into X at the beginning of every column group. | |||
#define X_ORG $r15 | |||
// PY0/PY1/PY2: extra y addresses (Y + 2, 4 and 6 strides) used when updating eight y values. | |||
#define PY1 $r16 | |||
// Byte step added to the column pointers when moving on to the next group of columns. | |||
#define K_LDA $r17 | |||
#define PY2 $r18 | |||
// Scratch address register. | |||
#define T0 $r19 | |||
// PA0..PA7: pointers into eight consecutive columns of A. | |||
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
// M converted to a byte count (M << 3). | |||
#define M8 $r30 | |||
#define VALPHA $xr0 | |||
// X0/X1: two vectors of x (eight values). | |||
#define X0 $xr1 | |||
#define X1 $xr2 | |||
// A0..A15: vectors loaded from A. | |||
#define A0 $xr3 | |||
#define A1 $xr4 | |||
#define A2 $xr5 | |||
#define A3 $xr6 | |||
#define A4 $xr7 | |||
#define A5 $xr8 | |||
#define A6 $xr9 | |||
#define A7 $xr10 | |||
#define A8 $xr11 | |||
#define A9 $xr12 | |||
#define A10 $xr13 | |||
#define A11 $xr14 | |||
#define A12 $xr15 | |||
#define A13 $xr16 | |||
#define A14 $xr17 | |||
#define A15 $xr18 | |||
// TP0..TP7: per-column vector accumulators (cleared by ZERO_Y*). | |||
#define TP0 $xr19 | |||
#define TP1 $xr20 | |||
#define TP2 $xr21 | |||
#define TP3 $xr22 | |||
#define TP4 $xr23 | |||
#define TP5 $xr24 | |||
#define TP6 $xr25 | |||
#define TP7 $xr26 | |||
// Y0..Y7 reuse the registers of A0..A7 ($xr3..$xr10); they receive the GACC results. | |||
#define Y0 $xr3 | |||
#define Y1 $xr4 | |||
#define Y2 $xr5 | |||
#define Y3 $xr6 | |||
#define Y4 $xr7 | |||
#define Y5 $xr8 | |||
#define Y6 $xr9 | |||
#define Y7 $xr10 | |||
// ZERO_Y8 / ZERO_Y4 / ZERO_Y2 / ZERO_Y1: reset the first 8 / 4 / 2 / 1 accumulators | |||
// TP0.. before a column group is processed. Each register is xor-ed with itself. | |||
// GXOR is defined outside this view; presumably it emits one xvxor.v per | |||
// (out, in0, in1) triple - confirm in the macro header. | |||
.macro ZERO_Y8 | |||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ | |||
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 | |||
.endm | |||
// Four accumulators (4-column tail). | |||
.macro ZERO_Y4 | |||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 | |||
.endm | |||
// Two accumulators (2-column tail). | |||
.macro ZERO_Y2 | |||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 | |||
.endm | |||
// One accumulator (single-column tail). | |||
.macro ZERO_Y1 | |||
GXOR xv, v, TP0, TP0, TP0 | |||
.endm | |||
// DLOAD_X8: contiguous x, two full-vector loads into X0 (offset 0) and X1 (offset 0x20). | |||
.macro DLOAD_X8 | |||
    xvld X0, X, 0x00 | |||
    xvld X1, X, 0x20 | |||
.endm | |||
// DLOAD_X4: contiguous x, one full-vector load into X0. | |||
.macro DLOAD_X4 | |||
    xvld X0, X, 0x00 | |||
.endm | |||
// DLOAD_X8_GAP: gather eight strided x values (INC_X bytes apart) into X0 and X1. | |||
// $f1..$f5 carry the register numbers of X0, X1, A0, A1 and A2, which serve as scratch here. | |||
// GINSVE0 is defined outside this view; presumably it inserts element 0 of the | |||
// source register into the given lane of the destination - confirm in the macro header. | |||
.macro DLOAD_X8_GAP | |||
// x[0..3] -> $f1, $f2, $f3, $f4 | |||
fld.d $f1, X, 0x00 | |||
fldx.d $f2, X, INC_X | |||
PTR_ALSL T0, INC_X, X, 1 | |||
fld.d $f3, T0, 0x00 | |||
fldx.d $f4, T0, INC_X | |||
// Pack x[1..3] into lanes 1..3 of X0. X1 ($f2) is consumed here and reloaded just below. | |||
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | |||
// T0 = X + 4 * INC_X: x[4..7] -> $f2, $f3, $f4, $f5 | |||
PTR_ALSL T0, INC_X, X, 2 | |||
fld.d $f2, T0, 0x00 | |||
fldx.d $f3, T0, INC_X | |||
PTR_ALSL T0, INC_X, T0, 1 | |||
fld.d $f4, T0, 0x00 | |||
fldx.d $f5, T0, INC_X | |||
// Pack x[5..7] into lanes 1..3 of X1. | |||
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 | |||
.endm | |||
// DLOAD_X4_GAP: gather four strided x values (INC_X bytes apart) into X0. | |||
// $f2..$f4 carry the register numbers of X1, A0 and A1, used as scratch. | |||
.macro DLOAD_X4_GAP | |||
fld.d $f1, X, 0x00 | |||
fldx.d $f2, X, INC_X | |||
// T0 = X + 2 * INC_X | |||
PTR_ALSL T0, INC_X, X, 1 | |||
fld.d $f3, T0, 0x00 | |||
fldx.d $f4, T0, INC_X | |||
// GINSVE0 is defined outside this view; presumably it packs x[1..3] into lanes 1..3 of X0. | |||
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | |||
.endm | |||
// DGEMV_T_8x8: eight columns, eight rows each. | |||
// Two vectors are loaded from every column pointer (each load post-increments the | |||
// pointer by 0x20 bytes); column n accumulates into TPn using X0 and X1. | |||
.macro DGEMV_T_8x8 | |||
    // Loads, one column per line. | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0, A5, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0, A7, PA3, 0 | |||
    GLD_INC xv, , 0x20, A8, PA4, 0, A9, PA4, 0 | |||
    GLD_INC xv, , 0x20, A10, PA5, 0, A11, PA5, 0 | |||
    GLD_INC xv, , 0x20, A12, PA6, 0, A13, PA6, 0 | |||
    GLD_INC xv, , 0x20, A14, PA7, 0, A15, PA7, 0 | |||
    // Fused multiply-adds, one column per line. | |||
    GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0 | |||
    GMADD xvf, d, TP1, A2, X0, TP1, TP1, A3, X1, TP1 | |||
    GMADD xvf, d, TP2, A4, X0, TP2, TP2, A5, X1, TP2 | |||
    GMADD xvf, d, TP3, A6, X0, TP3, TP3, A7, X1, TP3 | |||
    GMADD xvf, d, TP4, A8, X0, TP4, TP4, A9, X1, TP4 | |||
    GMADD xvf, d, TP5, A10, X0, TP5, TP5, A11, X1, TP5 | |||
    GMADD xvf, d, TP6, A12, X0, TP6, TP6, A13, X1, TP6 | |||
    GMADD xvf, d, TP7, A14, X0, TP7, TP7, A15, X1, TP7 | |||
.endm | |||
// DGEMV_T_8x4: eight columns, four rows each. | |||
// One vector is loaded from every column pointer PA0..PA7 (post-increment 0x20 bytes) | |||
// and column n accumulates into TPn using X0. | |||
// The GMADD list no longer ends in a dangling comma, so no empty trailing | |||
// argument is handed to the macro. | |||
.macro DGEMV_T_8x4 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ | |||
                        A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 | |||
    GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ | |||
                  TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ | |||
                  TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ | |||
                  TP6, A12, X0, TP6, TP7, A14, X0, TP7 | |||
.endm | |||
// DGEMV_T_4x8: four columns, eight rows each. | |||
// Two vectors per column pointer PA0..PA3; column n accumulates into TPn using X0 and X1. | |||
.macro DGEMV_T_4x8 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0, A5, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0, A7, PA3, 0 | |||
    GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0 | |||
    GMADD xvf, d, TP1, A2, X0, TP1, TP1, A3, X1, TP1 | |||
    GMADD xvf, d, TP2, A4, X0, TP2, TP2, A5, X1, TP2 | |||
    GMADD xvf, d, TP3, A6, X0, TP3, TP3, A7, X1, TP3 | |||
.endm | |||
// DGEMV_T_4x4: four columns, four rows each; column n accumulates into TPn using X0. | |||
.macro DGEMV_T_4x4 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0 | |||
    GLD_INC xv, , 0x20, A4, PA2, 0 | |||
    GLD_INC xv, , 0x20, A6, PA3, 0 | |||
    GMADD xvf, d, TP0, A0, X0, TP0 | |||
    GMADD xvf, d, TP1, A2, X0, TP1 | |||
    GMADD xvf, d, TP2, A4, X0, TP2 | |||
    GMADD xvf, d, TP3, A6, X0, TP3 | |||
.endm | |||
// DGEMV_T_2x8: two columns, eight rows each; column n accumulates into TPn using X0 and X1. | |||
.macro DGEMV_T_2x8 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0, A3, PA1, 0 | |||
    GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0 | |||
    GMADD xvf, d, TP1, A2, X0, TP1, TP1, A3, X1, TP1 | |||
.endm | |||
// DGEMV_T_2x4: two columns, four rows each; column n accumulates into TPn using X0. | |||
.macro DGEMV_T_2x4 | |||
    GLD_INC xv, , 0x20, A0, PA0, 0 | |||
    GLD_INC xv, , 0x20, A2, PA1, 0 | |||
    GMADD xvf, d, TP0, A0, X0, TP0 | |||
    GMADD xvf, d, TP1, A2, X0, TP1 | |||
.endm | |||
// DGEMV_T: expands to a complete transposed driver for one x-stride variant. | |||
// For every column it forms the dot product of that column with x and adds | |||
// ALPHA times the result to the matching y element. | |||
//   XW     - label infix; keeps the local labels of each instantiation unique | |||
//   X8, X4 - suffixes of the DLOAD_* macro that fetches 8 / 4 x values | |||
// Columns are consumed in groups of 8, then 4, 2 and 1. Inside every column group | |||
// the rows are consumed in groups of 8, then 4 (vector accumulators TPn), then one | |||
// at a time (scalar accumulators), restarting at X_ORG each time. | |||
// y is always accessed with scalar loads/stores and register strides. | |||
// GACC is defined outside this view; presumably it reduces each TPn into the | |||
// low element of Yn - confirm in the macro header. | |||
.macro DGEMV_T XW:req X8:req, X4:req | |||
// ---------------- column groups of 8 ---------------- | |||
PTR_SRLI J, N, 3 | |||
beqz J, .L_\XW\()_N_7 | |||
// K_LDA = 8 * LDA - M8 (LDA and M8 are already byte counts). | |||
PTR_SLLI K_LDA, LDA, 3 | |||
PTR_SUB K_LDA, K_LDA, M8 | |||
.L_\XW\()_N_L8: | |||
ZERO_Y8 | |||
move X, X_ORG | |||
PTR_SRLI I, M, 3 | |||
beqz I, .L_\XW\()_M_7 | |||
.align 5 | |||
.L_\XW\()_M_L8: | |||
// 8 rows of 8 columns | |||
DLOAD_\X8 | |||
DGEMV_T_8x8 | |||
PTR_ADDI I, I, -1 | |||
PTR_ALSL X, INC_X, X, 3 | |||
bnez I, .L_\XW\()_M_L8 | |||
.L_\XW\()_M_7: | |||
// 4 rows of 8 columns, taken when bit 2 of M is set | |||
andi I, M, 4 | |||
beqz I, .L_\XW\()_M_3 | |||
DLOAD_\X4 | |||
DGEMV_T_8x4 | |||
PTR_ALSL X, INC_X, X, 2 | |||
.L_\XW\()_M_3: | |||
// Fold the vector accumulators TP0..TP7 into Y0..Y7 before the scalar tail. | |||
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ | |||
Y5, TP5, Y6, TP6, Y7, TP7 | |||
andi I, M, 3 | |||
beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
// Scalar tail: one x value against one element of each of the eight columns. | |||
fld.d $f1, X, 0x00 | |||
fld.d $f11, PA0, 0x00 | |||
fld.d $f12, PA1, 0x00 | |||
fld.d $f13, PA2, 0x00 | |||
fld.d $f14, PA3, 0x00 | |||
fld.d $f15, PA4, 0x00 | |||
fld.d $f16, PA5, 0x00 | |||
fld.d $f17, PA6, 0x00 | |||
fld.d $f18, PA7, 0x00 | |||
#if __loongarch_grlen == 64 | |||
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
#else | |||
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
#endif | |||
// $f3..$f10 carry the register numbers of Y0..Y7 and hold the running sums. | |||
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ | |||
$f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
// Load eight y values; PY0 = Y + 2 strides, PY1 = Y + 4 strides, PY2 = PY1 + 2 strides. | |||
fld.d $f11, Y, 0x00 | |||
fldx.d $f12, Y, INC_Y | |||
PTR_ALSL PY0, INC_Y, Y, 1 | |||
fld.d $f13, PY0, 0x00 | |||
fldx.d $f14, PY0, INC_Y | |||
PTR_ALSL PY1, INC_Y, Y, 2 | |||
fld.d $f15, PY1, 0x00 | |||
fldx.d $f16, PY1, INC_Y | |||
PTR_ALSL PY2, INC_Y, PY1, 1 | |||
fld.d $f17, PY2, 0x00 | |||
fldx.d $f18, PY2, INC_Y | |||
// y[n] += ALPHA * sum[n] | |||
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ | |||
$f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 | |||
PTR_ADDI J, J, -1 | |||
// Step all eight column pointers on to the next group of eight columns. | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#else | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#endif | |||
// Write the eight updated y values back, then advance Y by eight strides. | |||
fst.d $f11, Y, 0x00 | |||
fstx.d $f12, Y, INC_Y | |||
fst.d $f13, PY0, 0x00 | |||
fstx.d $f14, PY0, INC_Y | |||
fst.d $f15, PY1, 0x00 | |||
fstx.d $f16, PY1, INC_Y | |||
fst.d $f17, PY2, 0x00 | |||
fstx.d $f18, PY2, INC_Y | |||
PTR_ALSL Y, INC_Y, Y, 3 | |||
bnez J, .L_\XW\()_N_L8 | |||
.L_\XW\()_N_7: | |||
// ---------------- 4 remaining columns (bit 2 of N) ---------------- | |||
andi J, N, 4 | |||
beqz J, .L_\XW\()_N_3 | |||
ZERO_Y4 | |||
move X, X_ORG | |||
PTR_SRLI I, M, 3 | |||
beqz I, .L_\XW\()_N_4_M_7 | |||
.align 5 | |||
.L_\XW\()_N_4_M_L8: | |||
// 8 rows of 4 columns | |||
DLOAD_\X8 | |||
DGEMV_T_4x8 | |||
PTR_ADDI I, I, -1 | |||
PTR_ALSL X, INC_X, X, 3 | |||
bnez I, .L_\XW\()_N_4_M_L8 | |||
.L_\XW\()_N_4_M_7: | |||
// 4 rows of 4 columns | |||
andi I, M, 4 | |||
beqz I, .L_\XW\()_N_4_M_3 | |||
DLOAD_\X4 | |||
DGEMV_T_4x4 | |||
PTR_ALSL X, INC_X, X, 2 | |||
.L_\XW\()_N_4_M_3: | |||
// Fold the vector accumulators TP0..TP3 into Y0..Y3 before the scalar tail. | |||
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 | |||
andi I, M, 3 | |||
beqz I, .L_\XW\()_N_4_M_END | |||
.align 5 | |||
.L_\XW\()_N_4_M_L1: | |||
// Scalar tail: one x value against one element of each of the four columns. | |||
fld.d $f1, X, 0x00 | |||
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 | |||
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
bnez I, .L_\XW\()_N_4_M_L1 | |||
.L_\XW\()_N_4_M_END: | |||
// Update four y values: y[n] += ALPHA * sum[n]. | |||
fld.d $f11, Y, 0x00 | |||
fldx.d $f12, Y, INC_Y | |||
PTR_ALSL PY0, INC_Y, Y, 1 | |||
fld.d $f13, PY0, 0x00 | |||
fldx.d $f14, PY0, INC_Y | |||
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 | |||
// K_LDA = 4 * LDA - M8 | |||
PTR_SLLI K_LDA, LDA, 2 | |||
PTR_SUB K_LDA, K_LDA, M8 | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#else | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#endif | |||
fst.d $f11, Y, 0x00 | |||
fstx.d $f12, Y, INC_Y | |||
fst.d $f13, PY0, 0x00 | |||
fstx.d $f14, PY0, INC_Y | |||
PTR_ALSL Y, INC_Y, Y, 2 | |||
.L_\XW\()_N_3: | |||
// ---------------- 2 remaining columns (bit 1 of N) ---------------- | |||
andi J, N, 2 | |||
beqz J, .L_\XW\()_N_1 | |||
ZERO_Y2 | |||
move X, X_ORG | |||
PTR_SRLI I, M, 3 | |||
beqz I, .L_\XW\()_N_2_M_7 | |||
.align 5 | |||
.L_\XW\()_N_2_M_L8: | |||
// 8 rows of 2 columns | |||
DLOAD_\X8 | |||
DGEMV_T_2x8 | |||
PTR_ADDI I, I, -1 | |||
PTR_ALSL X, INC_X, X, 3 | |||
bnez I, .L_\XW\()_N_2_M_L8 | |||
.L_\XW\()_N_2_M_7: | |||
// 4 rows of 2 columns | |||
andi I, M, 4 | |||
beqz I, .L_\XW\()_N_2_M_3 | |||
DLOAD_\X4 | |||
DGEMV_T_2x4 | |||
PTR_ALSL X, INC_X, X, 2 | |||
.L_\XW\()_N_2_M_3: | |||
// Fold the vector accumulators TP0/TP1 into Y0/Y1 before the scalar tail. | |||
GACC xvf, d, Y0, TP0, Y1, TP1 | |||
andi I, M, 3 | |||
beqz I, .L_\XW\()_N_2_M_END | |||
.align 5 | |||
.L_\XW\()_N_2_M_L1: | |||
// Scalar tail: one x value against one element of each of the two columns. | |||
fld.d $f1, X, 0x00 | |||
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 | |||
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
bnez I, .L_\XW\()_N_2_M_L1 | |||
.L_\XW\()_N_2_M_END: | |||
// Update two y values: y[n] += ALPHA * sum[n]. | |||
fld.d $f11, Y, 0x00 | |||
fldx.d $f12, Y, INC_Y | |||
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 | |||
// K_LDA = 2 * LDA - M8 | |||
PTR_SLLI K_LDA, LDA, 1 | |||
PTR_SUB K_LDA, K_LDA, M8 | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
#else | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
#endif | |||
fst.d $f11, Y, 0x00 | |||
fstx.d $f12, Y, INC_Y | |||
PTR_ALSL Y, INC_Y, Y, 1 | |||
.L_\XW\()_N_1: | |||
// ---------------- last single column (bit 0 of N) ---------------- | |||
andi J, N, 1 | |||
beqz J, .L_END | |||
ZERO_Y1 | |||
move X, X_ORG | |||
move I, M | |||
beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
// Fully scalar dot product; $f19 carries the register number of TP0, cleared by ZERO_Y1. | |||
fld.d $f3, PA0, 0x00 | |||
fld.d $f1, X, 0x00 | |||
fmadd.d $f19, $f3, $f1, $f19 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
PTR_ADDI PA0, PA0, 0x08 | |||
bnez I, .L_\XW\()_N_1_M_L1 | |||
// y[0] += ALPHA * sum | |||
fld.d $f3, Y, 0x00 | |||
fmadd.d $f3, ALPHA, $f19, $f3 | |||
fst.d $f3, Y, 0x00 | |||
b .L_END | |||
.endm | |||
// Kernel entry: set up strides and column pointers, then jump to the DGEMV_T | |||
// instantiation that matches inc_x. | |||
PROLOGUE | |||
// inc_y is read from the stack at offset 0, before push_if_used moves $sp. | |||
PTR_LD INC_Y, $sp, 0 | |||
// Reserve and fill save slots for 8 integer and 3 FP registers beyond the caller-saved sets. | |||
push_if_used 17 + 8, 24 + 3 | |||
// K = 1; I = inc_x - 1, collapsed to a 0/1 flag that indexes .L_GAP_TABLE. | |||
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
// Convert LDA, INC_X, INC_Y to byte counts (shift by 3) and derive M8 = M << 3. | |||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
// VALPHA is $xr0 itself here: element 0 is replicated to all lanes in place. | |||
xvreplve0.d VALPHA, $xr0 | |||
move X_ORG, X | |||
// PA0 = A, and each following pointer is one LDA (in bytes) further. | |||
move PA0, A | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#else | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#endif | |||
// Table dispatch: entries are 16-bit offsets relative to .L_GAP_TABLE, | |||
// so the entry address is table + 2 * index and the target is table + entry. | |||
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
.L_GAP_TABLE: | |||
.hword .L_GAP_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1 - .L_GAP_TABLE | |||
// Each DGEMV_T expansion ends with a branch to .L_END, so the bodies do not fall through. | |||
.L_GAP_0: /* if (incx == 1) */ | |||
DGEMV_T GAP_0, X8, X4 | |||
.L_GAP_1: /* if (incx != 1) */ | |||
DGEMV_T GAP_1, X8_GAP, X4_GAP | |||
.L_END: | |||
// Restore the registers saved above (same counts as the push) and return. | |||
pop_if_used 17 + 8, 24 + 3 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE | |||
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MTC s1, $r0 | |||
bge $r0, N, .L999 | |||
slli.d INCX, INCX, BASE_SHIFT | |||
bge $r0, INCX, .L999 | |||
beq $r0, INCX, .L999 | |||
move XX, X | |||
NOP | |||
LD a1, X, 0 * SIZE | |||
@@ -0,0 +1,407 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
// Width-dependent aliases. PTR_* select the .d or .w form of an integer | |||
// instruction from the general-purpose register width, so pointer arithmetic | |||
// can be written once. REG_SIZE / REG_LOG give the register size in bytes and its log2. | |||
#if __loongarch_grlen == 64 | |||
#define LA_REG int64_t | |||
#define REG_SIZE 8 | |||
#define REG_LOG 3 | |||
#define PTR_ADDI addi.d | |||
#define PTR_ADD add.d | |||
#define PTR_SUB sub.d | |||
#define PTR_LD ld.d | |||
#define PTR_ST st.d | |||
#define PTR_SLLI slli.d | |||
#define PTR_SRLI srli.d | |||
#define PTR_SRAI srai.d | |||
#define PTR_MUL mul.d | |||
#define PTR_ALSL alsl.d | |||
#else | |||
#define LA_REG int32_t | |||
#define REG_SIZE 4 | |||
#define REG_LOG 2 | |||
#define PTR_ADDI addi.w | |||
#define PTR_ADD add.w | |||
#define PTR_SUB sub.w | |||
#define PTR_LD ld.w | |||
#define PTR_ST st.w | |||
#define PTR_SLLI slli.w | |||
#define PTR_SRLI srli.w | |||
#define PTR_SRAI srai.w | |||
#define PTR_MUL mul.w | |||
#define PTR_ALSL alsl.w | |||
#endif | |||
// Same idea for the floating-point register width: FREG_SIZE / FREG_LOG and | |||
// the load/store used to spill an FP register. | |||
#if __loongarch_frlen == 64 | |||
#define FREG_SIZE 8 | |||
#define FREG_LOG 3 | |||
#define PTR_FLD fld.d | |||
#define PTR_FST fst.d | |||
#else | |||
#define FREG_SIZE 4 | |||
#define FREG_LOG 2 | |||
#define PTR_FLD fld.s | |||
#define PTR_FST fst.s | |||
#endif | |||
// The max registers available to the user which | |||
// do not need to be preserved across calls. | |||
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html | |||
// push_if_used / pop_if_used save only the registers requested beyond these limits. | |||
#define MAX_INT_CALLER_SAVED 17 | |||
#define MAX_FP_CALLER_SAVED 24 | |||
.altmacro // Enable alternate macro mode | |||
// push_if_used regs, fregs | |||
//   regs  - number of integer registers the routine uses | |||
//   fregs - number of FP registers the routine uses | |||
// For each class, when the count exceeds the caller-saved limit, $sp is lowered | |||
// by one slot per excess register and the excess registers are stored there | |||
// ($s0.. via push_regs, $fs0.. via push_fregs). Integer registers are pushed first. | |||
// Must be paired with pop_if_used called with the same two counts. | |||
.macro push_if_used regs, fregs | |||
.if \regs > MAX_INT_CALLER_SAVED | |||
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) | |||
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
.endif | |||
.if \fregs > MAX_FP_CALLER_SAVED | |||
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) | |||
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
.endif | |||
.endm // End push_if_used | |||
// pop_if_used regs, fregs | |||
// Reverse of push_if_used: FP registers are reloaded and their area released | |||
// first, then the integer registers, mirroring the push order. | |||
// Must receive the same two counts as the matching push_if_used. | |||
.macro pop_if_used regs, fregs | |||
.if \fregs > MAX_FP_CALLER_SAVED | |||
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG | |||
.endif | |||
.if \regs > MAX_INT_CALLER_SAVED | |||
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG | |||
.endif | |||
.endm // End pop_if_used | |||
// push_regs from, to: store $s<from> .. $s<to> at $sp + index * REG_SIZE. | |||
// Recurses with from + 1 until from equals to; the % prefix (altmacro mode) | |||
// evaluates the expression so the next level receives a plain number. | |||
.macro push_regs from, to | |||
PTR_ST $s\()\from, $sp, \from << REG_LOG | |||
.if \to - \from | |||
push_regs %from + 1, \to | |||
.endif | |||
.endm // End push_regs | |||
// pop_regs from, to: reload $s<from> .. $s<to> from $sp + index * REG_SIZE. | |||
// Same recursion scheme as push_regs (relies on altmacro % evaluation). | |||
.macro pop_regs from, to | |||
PTR_LD $s\()\from, $sp, \from << REG_LOG | |||
.if \to - \from | |||
pop_regs %from + 1, \to | |||
.endif | |||
.endm // End pop_regs | |||
// push_fregs from, to: store $fs<from> .. $fs<to> at $sp + index * FREG_SIZE. | |||
// Recurses with from + 1 until from equals to (relies on altmacro % evaluation). | |||
.macro push_fregs from, to | |||
PTR_FST $fs\()\from, $sp, \from << FREG_LOG | |||
.if \to - \from | |||
push_fregs %from + 1, \to | |||
.endif | |||
.endm // End push_fregs | |||
// pop_fregs from, to: reload $fs<from> .. $fs<to> from $sp + index * FREG_SIZE. | |||
// Same recursion scheme as push_fregs (relies on altmacro % evaluation). | |||
.macro pop_fregs from, to | |||
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG | |||
.if \to - \from | |||
pop_fregs %from + 1, \to | |||
.endif | |||
.endm // End pop_fregs | |||
// | |||
// Instruction Related Macros | |||
// | |||
// GLD | |||
// | |||
// GLD: emit one load per (out, src, offset) triple, offset being an immediate. | |||
// The mnemonic is <pre_op>ld.<suf_op>, or plain <pre_op>ld when suf_op is 0 | |||
// (the default, which callers select by leaving the field empty). | |||
// Any further triples are handled by recursion. | |||
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg | |||
.ifnes "\suf_op", "0" | |||
    \pre_op\()ld.\suf_op \out, \src, \offset | |||
.else | |||
    \pre_op\()ld \out, \src, \offset | |||
.endif | |||
.ifnb \more | |||
    GLD \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GLD_INC | |||
// | |||
// GLD_INC: like GLD, but after every load the source register is advanced by | |||
// inc bytes (PTR_ADDI src, src, inc). The same inc applies to every triple. | |||
.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
.ifnes "\suf_op", "0" | |||
    \pre_op\()ld.\suf_op \out, \src, \offset | |||
.else | |||
    \pre_op\()ld \out, \src, \offset | |||
.endif | |||
    PTR_ADDI \src, \src, \inc | |||
.ifnb \more | |||
    GLD_INC \pre_op, \suf_op, \inc, \more | |||
.endif | |||
.endm | |||
// | |||
// GLDX is same as GLD except the stride is a register | |||
// | |||
// GLDX: indexed-load counterpart of GLD; the third element of each triple is a | |||
// register. Emits <pre_op>ldx.<suf_op>, or <pre_op>ldx when suf_op is 0. | |||
.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg | |||
.ifnes "\suf_op", "0" | |||
    \pre_op\()ldx.\suf_op \out, \src, \offset | |||
.else | |||
    \pre_op\()ldx \out, \src, \offset | |||
.endif | |||
.ifnb \more | |||
    GLDX \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GLDREPL: emit <pre_op>ldrepl.<suf_op> out, src, offset for each
// out/src/offset triple. Unlike GLD the suffix is mandatory.
//
.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
\pre_op\()ldrepl.\suf_op \out, \src, \offset | |||
.ifnb \more | |||
GLDREPL \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GST: emit one store per (src, dst, offset) triple.
//   pre_op  - text pasted in front of "st" (kernels in this tree pass e.g.
//             v or xv)
//   suf_op  - optional suffix; "0" (the default, also used when the argument
//             is left empty) emits <pre_op>st, anything else emits
//             <pre_op>st.<suf_op>
//   src, dst, offset - register to store, base register, immediate offset
//   more    - further src/dst/offset triples, handled by recursion
//
.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg | |||
.ifeqs "\suf_op", "0" | |||
\pre_op\()st \src, \dst, \offset | |||
.else | |||
\pre_op\()st.\suf_op \src, \dst, \offset | |||
.endif | |||
.ifnb \more | |||
GST \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GMUL: emit <pre_op>mul.<suf_op> out, in0, in1 for each out/in0/in1 triple.
// pre_op may be left empty; further triples in \more are handled by recursion.
//
.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()mul.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GMUL \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GMADD: emit <pre_op>madd.<suf_op> out, in0, in1, in2 for each group of four
// operands. pre_op may be left empty; further groups in \more are handled by
// recursion.
//
.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
\pre_op\()madd.\suf_op \out, \in0, \in1, \in2 | |||
.ifnb \more | |||
GMADD \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GADD: emit <pre_op>add.<suf_op> out, in0, in1 for each out/in0/in1 triple.
// pre_op may be left empty; further triples in \more are handled by recursion.
//
.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()add.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GADD \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GADDI: emit <pre_op>addi.<suf_op> out, in0, in1 for each triple; in1 is the
// last operand of the addi form (an immediate, going by the mnemonic).
// Further triples in \more are handled by recursion.
//
.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()addi.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GADDI \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GSUB: emit <pre_op>sub.<suf_op> out, in0, in1 for each out/in0/in1 triple.
// pre_op may be left empty; further triples in \more are handled by recursion.
//
.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()sub.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GSUB \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GSLLI: emit <pre_op>slli.<suf_op> out, in0, in1 for each triple; in1 is the
// last operand of the slli form (a shift amount, going by the mnemonic).
// Further triples in \more are handled by recursion.
//
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()slli.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GSLLI \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GINSVE0: emit <pre_op>insve0.<suf_op> out, in0, in1 for each out/in0/in1
// triple. Both pre_op and suf_op are mandatory; further triples in \more are
// handled by recursion.
//
.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()insve0.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GINSVE0 \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GXOR: emit <pre_op>xor.<suf_op> out, in0, in1 for each out/in0/in1 triple.
// Both pre_op and suf_op are mandatory; further triples in \more are handled
// by recursion.
//
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()xor.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GXOR \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GPERMI: emit <pre_op>permi.<suf_op> out, in0, in1 for each triple, where
// in1 is the selector operand (the transpose macros in this file pass
// constants such as 0x02 and 0x31). Further triples in \more are handled by
// recursion.
//
.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
\pre_op\()permi.\suf_op \out, \in0, \in1 | |||
.ifnb \more | |||
GPERMI \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GNMSUB: emit <pre_op>nmsub.<suf_op> out, in0, in1, in2 for each group of
// four operands. Further groups in \more are handled by recursion.
//
.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
\pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 | |||
.ifnb \more | |||
GNMSUB \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GPRELD: emit one preld in0, in1, in2 instruction per operand triple. There
// is no prefix or suffix argument; further triples in \more are handled by
// recursion.
//
.macro GPRELD in0:req, in1:req, in2:req, more:vararg | |||
preld \in0, \in1, \in2 | |||
.ifnb \more | |||
GPRELD \more | |||
.endif | |||
.endm | |||
// | |||
// Compound instructions | |||
// | |||
// GACC: Accumulate the values of vector registers
//   pre_op  - selects the code path: xvf or vf (the adds become xvfadd /
//             vfadd) or xv or v (the adds become xvadd / vadd); any other
//             value emits no instructions for the pair
//   suf_op  - element suffix appended to the add mnemonic; it also decides
//             how many pack-odd + add folding steps are emitted
//   out, in - the folded value is built up in \out; \in is written as well on
//             most paths, so callers must treat it as clobbered
//   more    - further out/in pairs, handled by recursion
//
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | |||
// 256-bit floating point: fold the two 128-bit halves first, then the odd
// double-words, and for suf_op "s" also the odd words.
.ifeqs "\pre_op", "xvf" | |||
xvpermi.q \out, \in, 0x01 | |||
\pre_op\()add.\suf_op \in, \out, \in | |||
xvpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifeqs "\suf_op", "s" | |||
xvpackod.w \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
// 128-bit floating point: same folding without the cross-half step.
.ifeqs "\pre_op", "vf" | |||
vpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifeqs "\suf_op", "s" | |||
vpackod.w \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
// 256-bit integer: the nested .ifnc tests add one folding step per element
// width below d (w, then h, then b).
.ifeqs "\pre_op", "xv" | |||
xvpermi.q \out, \in, 0x01 | |||
\pre_op\()add.\suf_op \in, \out, \in | |||
xvpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "d" | |||
xvpackod.w \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "w" | |||
xvpackod.h \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "h" | |||
xvpackod.b \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
.endif | |||
.endif | |||
// 128-bit integer: same nesting as the xv path without the cross-half step.
.ifeqs "\pre_op", "v" | |||
vpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "d" | |||
vpackod.w \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "w" | |||
vpackod.h \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.ifnc "\suf_op", "h" | |||
vpackod.b \in, \out, \out | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
.endif | |||
.endif | |||
.ifnb \more | |||
GACC \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GMOV: copy vector register \in to \out by emitting <pre_op>or.v out, in, in
// (OR of a register with itself). Further out/in pairs in \more are handled
// by recursion and are emitted in the order given.
//
.macro GMOV pre_op:req, out:req, in:req, more:vararg | |||
\pre_op\()or.v \out, \in, \in | |||
.ifnb \more | |||
GMOV \pre_op, \more | |||
.endif | |||
.endm | |||
// | |||
// Media Related Macros | |||
// | |||
// GSBUTTERFLY: interleave in0 with in1. out0 receives <pre_op>ilvl.<suf_op>
// and out1 receives <pre_op>ilvh.<suf_op> of the same two inputs. out0 is
// written before in0/in1 are read for out1, so out0 must not alias an input.
.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 | |||
\pre_op\()ilvl.\suf_op \out0, \in0, \in1 | |||
\pre_op\()ilvh.\suf_op \out1, \in0, \in1 | |||
.endm | |||
// GINTERLACE: split in0/in1 by element parity. out0 receives
// <pre_op>pickev.<suf_op> and out1 receives <pre_op>pickod.<suf_op> of the
// same two inputs. out0 is written first, so it must not alias an input.
.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 | |||
\pre_op\()pickev.\suf_op \out0, \in0, \in1 | |||
\pre_op\()pickod.\suf_op \out1, \in0, \in1 | |||
.endm | |||
// | |||
// GTRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors,
// has no pre_op param. 128-bit vector instructions are not supported (xv is
// hard-coded below).
//   in0..in3   - source rows (note: inputs come first here, whereas
//                GTRANSPOSE8x8_W takes its outputs first)
//   out0..out3 - transposed rows
//   vt0, vt1   - scratch vectors, clobbered
//
.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ | |||
vt0, vt1 | |||
GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 | |||
GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 | |||
// GMOV copies in argument order: out0 = vt0, out2 = vt1, then vt1 = out3.
GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 | |||
// Recombine 128-bit halves so that each output holds one full transposed row.
GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 | |||
.endm | |||
// GTRANSPOSE8x8_W: transpose an 8x8 block of word elements held in eight
// 256-bit vectors. Only the xv forms are emitted.
//   out0..out7 - transposed rows (outputs come first in the argument list)
//   in0..in7   - source rows; they are only read
//   tmp0..tmp3 - scratch vectors, clobbered
// Steps: two rounds of word interleaves build 4x4 sub-blocks in each 128-bit
// half, then xvpermi.q swaps halves between out0..3 and out4..7.
.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ | |||
in0, in1, in2, in3, in4, in5, in6, in7, \ | |||
tmp0, tmp1, tmp2, tmp3 | |||
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 | |||
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 | |||
GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 | |||
GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 | |||
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 | |||
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 | |||
GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 | |||
GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 | |||
// Keep copies of out0..out3: their upper halves are still needed after the
// first four permutes overwrite them.
GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 | |||
GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ | |||
\out2, \out6, 0x02, \out3, \out7, 0x02, \ | |||
\out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ | |||
\out6, \tmp2, 0x31, \out7, \tmp3, 0x31 | |||
.endm |
@@ -0,0 +1,463 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/08/23 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
*********************************************************************/ | |||
/* Function parameters */ | |||
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: I counts iterations along M, J along N (J also serves as a
// scratch register for the N remainder tests).
#define I $r9 | |||
#define J $r10 | |||
// S1..S16: read pointers into up to 16 source columns. $r21 and $r22 are
// skipped; $r23 and above are callee-saved and are covered by the
// push_if_used / pop_if_used pair in the function body.
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
#define S9 $r20 | |||
#define S10 $r23 | |||
#define S11 $r24 | |||
#define S12 $r25 | |||
#define S13 $r26 | |||
#define S14 $r27 | |||
#define S15 $r28 | |||
#define S16 $r29 | |||
// TD / TS: running destination and source cursors.
#define TD $r30 | |||
#define TS $r31 | |||
// TL shares its register with LDA and T0 shares its register with SRC. The
// prologue copies SRC into TS before T0 is first written, and TL is derived
// from LDA in place.
#define TL $r7 | |||
#define T0 $r6 | |||
#undef ZERO | |||
#define ZERO $r0 | |||
// Scalar single-precision registers used by the element-wise tail loops.
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
/* LASX vectors */ | |||
// U0..U15 receive the loaded source data.
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
#define U8 $xr8 | |||
#define U9 $xr9 | |||
#define U10 $xr10 | |||
#define U11 $xr11 | |||
#define U12 $xr12 | |||
#define U13 $xr13 | |||
#define U14 $xr14 | |||
#define U15 $xr15 | |||
// D0..D15 receive the transposed data that is stored to the destination.
#define D0 $xr16 | |||
#define D1 $xr17 | |||
#define D2 $xr18 | |||
#define D3 $xr19 | |||
#define D4 $xr20 | |||
#define D5 $xr21 | |||
#define D6 $xr22 | |||
#define D7 $xr23 | |||
#define D8 $xr24 | |||
#define D9 $xr25 | |||
#define D10 $xr26 | |||
#define D11 $xr27 | |||
#define D12 $xr28 | |||
#define D13 $xr29 | |||
#define D14 $xr30 | |||
#define D15 $xr31 | |||
// Loops outline | |||
//.L_N16 <------------------- | |||
//| .L_M8: | | |||
//| .L_M7: | Main Loop | |||
//| .L_M1: | | |||
//| .L_M0: --------------- | |||
//.L_N15: | |||
//.L_N8: | |||
//| .L_N8_M8: | |||
//| .L_N8_M7: | |||
//| .L_N8_M1: | |||
//.L_N7: | |||
//.L_N4: | |||
//| .L_N4_M4: | |||
//| .L_N4_M3: | |||
//| .L_N4_M1: | |||
//.L_N3: | |||
//.L_N2: | |||
//| .L_N2_M2: | |||
//| .L_N2_M1: | |||
//.L_N1: | |||
//| .L_N1_M1: | |||
//.L_N0 | |||
// Entry: packs an M x N single-precision panel from SRC (column stride LDA
// elements) into DST, 16 columns at a time with 8/4/2/1-column tails.
PROLOGUE | |||
// 26 integer / 32 FP registers are in use; presumably push_if_used saves the
// callee-saved ones among them - TODO confirm against its definition.
push_if_used 26, 32 | |||
move TD, DST | |||
move TS, SRC | |||
// TL = LDA * 4: column stride in bytes (4-byte elements).
PTR_SLLI TL, LDA, 0x02 | |||
// T0 = 2 * TL: stride of two columns. This overwrites the SRC register,
// which has already been copied to TS.
PTR_SLLI T0, TL, 0x01 | |||
// J = N / 16: number of full 16-column panels; skip the main loop if none.
PTR_SRAI J, N, 0x04 | |||
beq J, ZERO, .L_N15 | |||
.align 5 | |||
.L_N16: | |||
// Set up one 16-column panel: S1..S16 point at columns 0..15 starting at TS,
// I = M / 8 is the vector iteration count, J is decremented for this panel,
// and TS ends up at the first column of the next panel (S15 + 2 columns).
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x03 | |||
PTR_ADD S3, S2, TL | |||
PTR_ADDI J, J, -1 | |||
PTR_ADD S4, S3, TL | |||
PTR_ADD S5, S3, T0 | |||
PTR_ADD S6, S4, T0 | |||
PTR_ADD S7, S5, T0 | |||
PTR_ADD S8, S6, T0 | |||
PTR_ADD S9, S7, T0 | |||
PTR_ADD S10, S8, T0 | |||
PTR_ADD S11, S9, T0 | |||
PTR_ADD S12, S10, T0 | |||
PTR_ADD S13, S11, T0 | |||
PTR_ADD S14, S12, T0 | |||
PTR_ADD S15, S13, T0 | |||
PTR_ADD S16, S14, T0 | |||
PTR_ADD TS, S15, T0 | |||
// Fewer than 8 rows: go straight to the scalar remainder loop.
beq I, ZERO, .L_M7 | |||
.align 5 | |||
.L_M8: | |||
// Vector body of the 16-column panel: 8 rows x 16 columns per iteration.
// Load 8 consecutive floats from each of the 16 columns.
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
xvld U8, S9, 0x00 | |||
xvld U9, S10, 0x00 | |||
xvld U10, S11, 0x00 | |||
xvld U11, S12, 0x00 | |||
xvld U12, S13, 0x00 | |||
xvld U13, S14, 0x00 | |||
xvld U14, S15, 0x00 | |||
xvld U15, S16, 0x00 | |||
// Transpose columns 0..7 into the even D registers and columns 8..15 into the
// odd D registers, so that (D[2r], D[2r+1]) together hold row r of the block.
// The second transpose reuses U0..U3 as scratch; they are dead by then.
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
D1, D3, D5, D7 // As tmp | |||
GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ | |||
U8, U9, U10, U11, U12, U13, U14, U15, \ | |||
U0, U1, U2, U3 // As tmp | |||
// Store the 8 rows back to back: 16 floats (0x40 bytes) per row, 0x200 bytes
// in total, written as two groups of 0x100 bytes.
GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ | |||
D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 | |||
PTR_ADDI TD, TD, 0x100 | |||
GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ | |||
D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 | |||
PTR_ADDI TD, TD, 0x100 | |||
// Advance every column pointer by 8 floats.
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI S5, S5, 0x20 | |||
PTR_ADDI S6, S6, 0x20 | |||
PTR_ADDI S7, S7, 0x20 | |||
PTR_ADDI S8, S8, 0x20 | |||
PTR_ADDI S9, S9, 0x20 | |||
PTR_ADDI S10, S10, 0x20 | |||
PTR_ADDI S11, S11, 0x20 | |||
PTR_ADDI S12, S12, 0x20 | |||
PTR_ADDI S13, S13, 0x20 | |||
PTR_ADDI S14, S14, 0x20 | |||
PTR_ADDI S15, S15, 0x20 | |||
PTR_ADDI S16, S16, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_M8 | |||
.L_M7: | |||
// Row remainder of the 16-column panel: I = M % 8 rows are copied one row at
// a time with scalar loads and stores.
andi I, M, 0x07 | |||
beq I, ZERO, .L_M0 | |||
.align 5 | |||
.L_M1: | |||
// First half of the row: one float from each of columns 0..7.
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fld.s F2, S3, 0x00 | |||
fld.s F3, S4, 0x00 | |||
fld.s F4, S5, 0x00 | |||
fld.s F5, S6, 0x00 | |||
fld.s F6, S7, 0x00 | |||
fld.s F7, S8, 0x00 | |||
fst.s F0, TD, 0x00 | |||
fst.s F1, TD, 0x04 | |||
fst.s F2, TD, 0x08 | |||
fst.s F3, TD, 0x0C | |||
fst.s F4, TD, 0x10 | |||
fst.s F5, TD, 0x14 | |||
fst.s F6, TD, 0x18 | |||
fst.s F7, TD, 0x1C | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI S3, S3, 0x04 | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI S5, S5, 0x04 | |||
PTR_ADDI S6, S6, 0x04 | |||
PTR_ADDI S7, S7, 0x04 | |||
PTR_ADDI S8, S8, 0x04 | |||
PTR_ADDI TD, TD, 0x20 | |||
// Second half of the row: one float from each of columns 8..15.
fld.s F0, S9, 0x00 | |||
fld.s F1, S10, 0x00 | |||
fld.s F2, S11, 0x00 | |||
fld.s F3, S12, 0x00 | |||
fld.s F4, S13, 0x00 | |||
fld.s F5, S14, 0x00 | |||
fld.s F6, S15, 0x00 | |||
fld.s F7, S16, 0x00 | |||
fst.s F0, TD, 0x00 | |||
fst.s F1, TD, 0x04 | |||
fst.s F2, TD, 0x08 | |||
fst.s F3, TD, 0x0C | |||
fst.s F4, TD, 0x10 | |||
fst.s F5, TD, 0x14 | |||
fst.s F6, TD, 0x18 | |||
fst.s F7, TD, 0x1C | |||
PTR_ADDI S9, S9, 0x04 | |||
PTR_ADDI S10, S10, 0x04 | |||
PTR_ADDI S11, S11, 0x04 | |||
PTR_ADDI S12, S12, 0x04 | |||
PTR_ADDI S13, S13, 0x04 | |||
PTR_ADDI S14, S14, 0x04 | |||
PTR_ADDI S15, S15, 0x04 | |||
PTR_ADDI S16, S16, 0x04 | |||
PTR_ADDI TD, TD, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_M1 | |||
.L_M0: | |||
// Next 16-column panel while J (decremented in .L_N16) is still positive.
blt ZERO, J, .L_N16 | |||
.L_N15: | |||
// Column remainder dispatch: nothing left if N % 16 == 0; otherwise handle
// an 8-column panel when bit 3 of N is set.
andi J, N, 0x0f | |||
beq ZERO, J, .L_N0 | |||
andi J, N, 0x08 | |||
beq ZERO, J, .L_N7 | |||
.L_N8: | |||
// S1..S8 point at the 8 columns of this panel; TS moves past them.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x03 | |||
PTR_ADD S3, S2, TL | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S5, S3, T0 | |||
PTR_ADD S6, S4, T0 | |||
PTR_ADD S7, S5, T0 | |||
PTR_ADD S8, S6, T0 | |||
PTR_ADD TS, S7, T0 | |||
// I = M / 8; with fewer than 8 rows only the scalar loop runs.
beq I, ZERO, .L_N8_M7 | |||
.align 5 | |||
.L_N8_M8: | |||
// Vector body of the 8-column panel: load 8 floats from each column,
// transpose the 8x8 block and store the 8 resulting rows contiguously.
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
D1, D3, D5, D7 // As tmp | |||
// Only the even D registers hold results here; the odd ones were scratch.
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
PTR_ADDI TD, TD, 0x100 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI S5, S5, 0x20 | |||
PTR_ADDI S6, S6, 0x20 | |||
PTR_ADDI S7, S7, 0x20 | |||
PTR_ADDI S8, S8, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N8_M8 | |||
.L_N8_M7: | |||
// Row remainder of the 8-column panel: M % 8 rows, one row per iteration.
andi I, M, 0x07 | |||
beq I, ZERO, .L_N7 | |||
.align 5 | |||
.L_N8_M1: | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fld.s F2, S3, 0x00 | |||
fld.s F3, S4, 0x00 | |||
fld.s F4, S5, 0x00 | |||
fld.s F5, S6, 0x00 | |||
fld.s F6, S7, 0x00 | |||
fld.s F7, S8, 0x00 | |||
// Stores are interleaved with the pointer increments.
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
fst.s F2, TD, 0x08 | |||
PTR_ADDI S3, S3, 0x04 | |||
fst.s F3, TD, 0x0C | |||
PTR_ADDI S4, S4, 0x04 | |||
fst.s F4, TD, 0x10 | |||
PTR_ADDI S5, S5, 0x04 | |||
fst.s F5, TD, 0x14 | |||
PTR_ADDI S6, S6, 0x04 | |||
fst.s F6, TD, 0x18 | |||
PTR_ADDI S7, S7, 0x04 | |||
fst.s F7, TD, 0x1C | |||
PTR_ADDI S8, S8, 0x04 | |||
PTR_ADDI TD, TD, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N8_M1 | |||
.L_N7: | |||
// Done if N % 8 == 0; otherwise handle a 4-column panel when bit 2 of N is
// set.
andi J, N, 0x07 | |||
beq ZERO, J, .L_N0 | |||
andi J, N, 0x04 | |||
beq ZERO, J, .L_N3 | |||
.L_N4: | |||
// S1..S4 point at the 4 columns; I = M / 4; TS moves past the panel.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x02 | |||
PTR_ADD S3, S2, TL | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD TS, S3, T0 | |||
beq I, ZERO, .L_N4_M3 | |||
.align 5 | |||
.L_N4_M4: | |||
// 4x4 block with 128-bit vectors: load 4 floats per column, transpose with
// two rounds of word interleaves, store 4 rows of 4 floats.
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI TD, TD, 0x40 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N4_M4 | |||
.L_N4_M3: | |||
// Row remainder of the 4-column panel: M % 4 rows, one row per iteration.
andi I, M, 0x03 | |||
beq I, ZERO, .L_N3 | |||
.align 5 | |||
.L_N4_M1: | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fld.s F2, S3, 0x00 | |||
fld.s F3, S4, 0x00 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
fst.s F2, TD, 0x08 | |||
PTR_ADDI S3, S3, 0x04 | |||
fst.s F3, TD, 0x0C | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI TD, TD, 0x10 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N4_M1 | |||
.L_N3: | |||
// Done if N % 4 == 0; otherwise handle a 2-column panel when bit 1 of N is
// set, else branch to the single-column code.
andi J, N, 0x03 | |||
beq ZERO, J, .L_N0 | |||
andi J, N, 0x02 | |||
beq ZERO, J, .L_N1 | |||
.L_N2: | |||
// S1/S2 point at the 2 columns; I = M / 2; TS moves past the panel.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x01 | |||
PTR_ADD TS, S2, TL | |||
beq I, ZERO, .L_N2_M1 | |||
.align 5 | |||
.L_N2_M2: | |||
// Load two floats from each column as one 64-bit value (fld.d), interleave
// the words through the overlapping $vr0/$vr1 views of F0/F1, and store the
// resulting 2 rows x 2 columns (16 bytes).
GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
vilvl.w $vr0, $vr1, $vr0 | |||
GST v, , $vr0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI TD, TD, 0x10 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N2_M2 | |||
.L_N2_M1: | |||
// Odd M: copy the last row of the 2-column panel. Execution then continues
// at .L_N1.
andi I, M, 0x01 | |||
beq I, ZERO, .L_N1 | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI TD, TD, 0x08 | |||
.align 5 | |||
.L_N1: | |||
// Final single column. This label is reached both by the branch in .L_N3
// (bit 1 of N clear, so bit 0 must be set) and by fall-through from the
// 2-column code, where bit 0 of N may be clear. Test it explicitly: without
// this guard an N with N % 4 == 2 would copy one extra column, reading M
// floats past the source panel and writing M floats past the packed output.
// J is free to use as scratch at this point.
andi J, N, 0x01
beq ZERO, J, .L_N0
move S1, TS | |||
beq ZERO, M, .L_N0 | |||
.L_N1_M1: | |||
// Copy the column one float at a time. M itself is used as the down-counter,
// which is fine because nothing reads M after this loop.
fld.s F0, S1, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI TD, TD, 0x04 | |||
PTR_ADDI M, M, -1 | |||
blt ZERO, M, .L_N1_M1 | |||
.L_N0: | |||
// Common exit. The counts passed to pop_if_used must match the push_if_used
// call in the prologue (26 integer, 32 FP).
pop_if_used 26, 32 | |||
// Return: jump to the address in $r1 without writing a link register.
jirl $r0, $r1, 0x0 | |||
EPILOGUE |
@@ -0,0 +1,298 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/08/23 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
*********************************************************************/ | |||
/* Function parameters */ | |||
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: I counts iterations along M, J along N (J also serves as a
// scratch register for the N remainder tests).
#define I $r9 | |||
#define J $r10 | |||
// S1..S8: read pointers into up to 8 source columns.
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
// TD / TS: running destination and source cursors.
#define TD $r20 | |||
#define TS $r11 | |||
// TL shares its register with LDA and T0 shares its register with SRC. The
// prologue copies SRC into TS before T0 is first written, and TL is derived
// from LDA in place.
#define TL $r7 | |||
#define T0 $r6 | |||
#undef ZERO | |||
#define ZERO $r0 | |||
// Scalar single-precision registers used by the element-wise tail loops.
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
/* LASX vectors */ | |||
// U0..U7 receive the loaded source data.
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
// D registers for the transpose. Only the even-numbered names exist above D7
// (D8, D10, D12, D14), so the 20 aliases map onto $xr0..$xr19 without gaps.
#define D0 $xr8 | |||
#define D1 $xr9 | |||
#define D2 $xr10 | |||
#define D3 $xr11 | |||
#define D4 $xr12 | |||
#define D5 $xr13 | |||
#define D6 $xr14 | |||
#define D7 $xr15 | |||
#define D8 $xr16 | |||
#define D10 $xr17 | |||
#define D12 $xr18 | |||
#define D14 $xr19 | |||
// Loops outline | |||
//.L_N8: <---------------- | |||
//| .L_M8: | | |||
//| .L_M7: | Main Loop | |||
//| .L_M1: | | |||
//| .L_M0:-------------- | |||
//.L_N7: | |||
//.L_N4: | |||
//| .L_N4_M4: | |||
//| .L_N4_M3: | |||
//| .L_N4_M1: | |||
//.L_N3: | |||
//.L_N2: | |||
//| .L_N2_M2: | |||
//| .L_N2_M1: | |||
//.L_N1: | |||
//| .L_N1_M1: | |||
//.L_N0 | |||
// Entry: packs an M x N single-precision panel from SRC (column stride LDA
// elements) into DST, 8 columns at a time with 4/2/1-column tails.
PROLOGUE | |||
// 17 integer / 20 FP registers are in use; presumably push_if_used saves the
// callee-saved ones among them - TODO confirm against its definition.
push_if_used 17, 20 | |||
move TD, DST | |||
move TS, SRC | |||
// TL = LDA * 4: column stride in bytes (4-byte elements).
PTR_SLLI TL, LDA, 0x02 | |||
// T0 = 2 * TL: stride of two columns. This overwrites the SRC register,
// which has already been copied to TS.
PTR_SLLI T0, TL, 0x01 | |||
// J = N / 8: number of full 8-column panels; skip the main loop if none.
PTR_SRAI J, N, 0x03 | |||
beq J, ZERO, .L_N7 | |||
.align 5 | |||
.L_N8: | |||
// Set up one 8-column panel: S1..S8 point at columns 0..7 starting at TS,
// I = M / 8 is the vector iteration count, J is decremented for this panel,
// and TS ends up at the first column of the next panel.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x03 | |||
PTR_ADD S3, S2, TL | |||
PTR_ADDI J, J, -1 | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S5, S3, T0 | |||
PTR_ADD S6, S4, T0 | |||
PTR_ADD S7, S5, T0 | |||
PTR_ADD S8, S6, T0 | |||
PTR_ADD TS, S7, T0 | |||
// Fewer than 8 rows: go straight to the scalar remainder loop.
beq I, ZERO, .L_M7 | |||
.align 5 | |||
.L_M8: | |||
// Vector body: load 8 floats from each of the 8 columns, transpose the 8x8
// block and store the 8 resulting rows contiguously (0x100 bytes).
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
D1, D3, D5, D7 // As tmp | |||
// Only the even D registers hold results; the odd ones were scratch.
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
PTR_ADDI TD, TD, 0x100 | |||
// Advance every column pointer by 8 floats.
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI S5, S5, 0x20 | |||
PTR_ADDI S6, S6, 0x20 | |||
PTR_ADDI S7, S7, 0x20 | |||
PTR_ADDI S8, S8, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_M8 | |||
.L_M7: | |||
// Row remainder of the 8-column panel: M % 8 rows, one row per iteration.
andi I, M, 0x07 | |||
beq I, ZERO, .L_M0 | |||
.align 5 | |||
.L_M1: | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fld.s F2, S3, 0x00 | |||
fld.s F3, S4, 0x00 | |||
fld.s F4, S5, 0x00 | |||
fld.s F5, S6, 0x00 | |||
fld.s F6, S7, 0x00 | |||
fld.s F7, S8, 0x00 | |||
// Stores are interleaved with the pointer increments.
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
fst.s F2, TD, 0x08 | |||
PTR_ADDI S3, S3, 0x04 | |||
fst.s F3, TD, 0x0C | |||
PTR_ADDI S4, S4, 0x04 | |||
fst.s F4, TD, 0x10 | |||
PTR_ADDI S5, S5, 0x04 | |||
fst.s F5, TD, 0x14 | |||
PTR_ADDI S6, S6, 0x04 | |||
fst.s F6, TD, 0x18 | |||
PTR_ADDI S7, S7, 0x04 | |||
fst.s F7, TD, 0x1C | |||
PTR_ADDI S8, S8, 0x04 | |||
PTR_ADDI TD, TD, 0x20 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_M1 | |||
.L_M0: | |||
// Next 8-column panel while J (decremented in .L_N8) is still positive.
blt ZERO, J, .L_N8 | |||
.L_N7: | |||
// Done if N % 8 == 0; otherwise handle a 4-column panel when bit 2 of N is
// set.
andi J, N, 0x07 | |||
beq ZERO, J, .L_N0 | |||
andi J, N, 0x04 | |||
beq ZERO, J, .L_N3 | |||
.L_N4: | |||
// S1..S4 point at the 4 columns; I = M / 4; TS moves past the panel.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x02 | |||
PTR_ADD S3, S2, TL | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD TS, S3, T0 | |||
beq I, ZERO, .L_N4_M3 | |||
.align 5 | |||
.L_N4_M4: | |||
// 4x4 block with 128-bit vectors: load 4 floats per column, transpose with
// two rounds of word interleaves, store 4 rows of 4 floats.
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI TD, TD, 0x40 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N4_M4 | |||
.L_N4_M3: | |||
// Row remainder of the 4-column panel: M % 4 rows, one row per iteration.
andi I, M, 0x03 | |||
beq I, ZERO, .L_N3 | |||
.align 5 | |||
.L_N4_M1: | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fld.s F2, S3, 0x00 | |||
fld.s F3, S4, 0x00 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
fst.s F2, TD, 0x08 | |||
PTR_ADDI S3, S3, 0x04 | |||
fst.s F3, TD, 0x0C | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI TD, TD, 0x10 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N4_M1 | |||
.L_N3: | |||
// Done if N % 4 == 0; otherwise handle a 2-column panel when bit 1 of N is
// set, else branch to the single-column code.
andi J, N, 0x03 | |||
beq ZERO, J, .L_N0 | |||
andi J, N, 0x02 | |||
beq ZERO, J, .L_N1 | |||
.L_N2: | |||
// S1/S2 point at the 2 columns; I = M / 2; TS moves past the panel.
move S1, TS | |||
PTR_ADD S2, TS, TL | |||
PTR_SRAI I, M, 0x01 | |||
PTR_ADD TS, S2, TL | |||
beq I, ZERO, .L_N2_M1 | |||
.align 5 | |||
.L_N2_M2: | |||
// Load two floats from each column as one 64-bit value (fld.d), interleave
// the words through the overlapping $vr0/$vr1 views of F0/F1, and store the
// resulting 2 rows x 2 columns (16 bytes).
GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
vilvl.w $vr0, $vr1, $vr0 | |||
GST v, , $vr0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI TD, TD, 0x10 | |||
PTR_ADDI I, I, -1 | |||
blt ZERO, I, .L_N2_M2 | |||
.L_N2_M1: | |||
// Odd M: copy the last row of the 2-column panel. Execution then continues
// at .L_N1.
andi I, M, 0x01 | |||
beq I, ZERO, .L_N1 | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S2, 0x00 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F1, TD, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI TD, TD, 0x08 | |||
.align 5 | |||
.L_N1: | |||
// Final single column. This label is reached both by the branch in .L_N3
// (bit 1 of N clear, so bit 0 must be set) and by fall-through from the
// 2-column code, where bit 0 of N may be clear. Test it explicitly: without
// this guard an N with N % 4 == 2 would copy one extra column, reading M
// floats past the source panel and writing M floats past the packed output.
// J is free to use as scratch at this point.
andi J, N, 0x01
beq ZERO, J, .L_N0
move S1, TS | |||
beq ZERO, M, .L_N0 | |||
.L_N1_M1: | |||
// Copy the column one float at a time. M itself is used as the down-counter,
// which is fine because nothing reads M after this loop.
fld.s F0, S1, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
fst.s F0, TD, 0x00 | |||
PTR_ADDI TD, TD, 0x04 | |||
PTR_ADDI M, M, -1 | |||
blt ZERO, M, .L_N1_M1 | |||
.L_N0: | |||
// Common exit. The counts passed to pop_if_used must match the push_if_used
// call in the prologue (17 integer, 20 FP).
pop_if_used 17, 20 | |||
// Return: jump to the address in $r1 without writing a link register.
jirl $r0, $r1, 0x0 | |||
EPILOGUE |
@@ -0,0 +1,526 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/08/23 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
*********************************************************************/ | |||
/* Function parameters */ | |||
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: J counts groups of rows (M / 8 in the main loop), I counts
// groups of columns within a row group.
#define I $r9 | |||
#define J $r10 | |||
// S0 is the running source cursor; S1..S8 point at the 8 source lines
// handled together in the main loop.
#define S0 $r11 | |||
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
// Destination cursors. P0 starts at DST and P1 is the per-iteration copy of
// it. P2..P5 are set in the prologue to DST + 4 * M * (N rounded down to a
// multiple of 16, 8, 4 and 2 respectively).
#define P0 $r20 | |||
#define P1 $r23 | |||
#define P2 $r24 | |||
#define P3 $r25 | |||
#define P4 $r26 | |||
#define P5 $r27 | |||
// Scratch registers; after the prologue T0 holds 2 * TL and T1 holds M << 6.
#define T0 $r28 | |||
#define T1 $r29 | |||
// TL shares its register with LDA; it is derived from LDA in place.
#define TL $r7 | |||
// NOTE(review): the sibling ncopy kernels issue "#undef ZERO" before this
// definition and this file does not - confirm that the included headers do
// not already define ZERO differently.
#define ZERO $r0 | |||
/* LASX vectors */ | |||
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
// Loops outline | |||
//.L_M8 <------------------- | |||
//| .L_N16: | | |||
//| .L_N15: | | |||
//| .L_N8: | | |||
//| .L_N7: | Main Loop | |||
//| .L_N4: | | |||
//| .L_N3: | | |||
//| .L_N2: | | |||
//| .L_N1: | | |||
//| .L_N0: --------------- | |||
//.L_M7 | |||
//.L_M4 | |||
//| .L_M4_N16: | |||
//| .L_M4_N15: | |||
//| .L_M4_N8: | |||
//| .L_M4_N7: | |||
//| .L_M4_N4: | |||
//| .L_M4_N3: | |||
//| .L_M4_N2: | |||
//| .L_M4_N1: | |||
//.L_M3 | |||
//.L_M2 | |||
//| .L_M2_N16: | |||
//| .L_M2_N15: | |||
//| .L_M2_N8: | |||
//| .L_M2_N7: | |||
//| .L_M2_N4: | |||
//| .L_M2_N3: | |||
//| .L_M2_N2: | |||
//| .L_M2_N1: | |||
//.L_M1 | |||
//| .L_M1_N16: | |||
//| .L_M1_N15: | |||
//| .L_M1_N8: | |||
//| .L_M1_N7: | |||
//| .L_M1_N4: | |||
//| .L_M1_N3: | |||
//| .L_M1_N2: | |||
//| .L_M1_N1: | |||
//.L_M0 | |||
// Entry point of the 16-wide copy kernel. | |||
// Inputs: M ($r4), N ($r5), SRC ($r6), LDA ($r7), DST ($r8). | |||
// The code below reads source lines that are TL = LDA << 2 bytes apart and | |||
// writes them to DST in groups: 16-column panels first, then separate | |||
// regions for the 8-, 4-, 2- and 1-column remainders of N. | |||
// NOTE(review): the 4-byte scaling suggests single-precision elements - confirm. | |||
PROLOGUE | |||
// NOTE(review): presumably saves the callee-saved registers this kernel | |||
// uses; the macro lives in loongarch64_asm.S and is not visible here. | |||
push_if_used 24, 8 | |||
move S0, SRC | |||
move P0, DST | |||
// T0 = N rounded down to a multiple of 16, T1 = N rounded down to a multiple of 8. | |||
PTR_SRAI T0, N, 0x04 | |||
PTR_SRAI T1, N, 0x03 | |||
PTR_SLLI T0, T0, 0x04 | |||
PTR_SLLI T1, T1, 0x03 | |||
// P2 = DST + M * T0 * 4 : output position of the 8-column remainder. | |||
// P3 = DST + M * T1 * 4 : output position of the 4-column remainder. | |||
PTR_MUL P2, M, T0 | |||
PTR_MUL P3, M, T1 | |||
PTR_SLLI P2, P2, 0x02 | |||
PTR_SLLI P3, P3, 0x02 | |||
PTR_ADD P2, DST, P2 | |||
PTR_ADD P3, DST, P3 | |||
// T0 = N rounded down to a multiple of 4, T1 = N rounded down to a multiple of 2. | |||
PTR_SRAI T0, N, 0x02 | |||
PTR_SRAI T1, N, 0x01 | |||
PTR_SLLI T0, T0, 0x02 | |||
PTR_SLLI T1, T1, 0x01 | |||
// P4 = DST + M * T0 * 4 : output position of the 2-column remainder. | |||
// P5 = DST + M * T1 * 4 : output position of the 1-column remainder. | |||
PTR_MUL P4, M, T0 | |||
PTR_MUL P5, M, T1 | |||
PTR_SLLI P4, P4, 0x02 | |||
PTR_SLLI P5, P5, 0x02 | |||
PTR_ADD P4, DST, P4 | |||
PTR_ADD P5, DST, P5 | |||
// TL aliases LDA ($r7): from here on it holds the line stride in bytes. | |||
PTR_SLLI TL, LDA, 0x02 | |||
// J = M / 8 : number of full 8-line groups. | |||
PTR_SRAI J, M, 0x03 | |||
// T0 = 2 * TL (stride of two lines), T1 = M * 64 (bytes between consecutive | |||
// 16-column panels in the output). | |||
PTR_SLLI T0, TL, 0x01 | |||
PTR_SLLI T1, M, 0x06 | |||
// No full group of 8 lines: go straight to the M remainder handling. | |||
beq ZERO, J, .L_M7 | |||
.align 5 | |||
// Main loop: each pass handles one group of 8 source lines; J counts the | |||
// groups still to do. | |||
.L_M8: | |||
// S1..S8 = the 8 lines of this group (S0 + k * TL, k = 0..7); S0 then moves | |||
// to the first line of the next group (S0 + 8 * TL). | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S3, S1, T0 | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S5, S3, T0 | |||
PTR_ADD S6, S4, T0 | |||
PTR_ADD S7, S5, T0 | |||
PTR_ADD S8, S6, T0 | |||
PTR_ADD S0, S7, T0 | |||
// P1 walks the 16-column panels for this group; P0 advances by 0x200 bytes | |||
// (8 lines x 0x40 bytes) to the slot of the next group. | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x200 | |||
// I = N / 16 : number of full 16-column chunks. | |||
PTR_SRAI I, N, 0x04 | |||
PTR_ADDI J, J, -1 | |||
beq ZERO, I, .L_N15 | |||
// 16-column chunk: two 32-byte vector loads per line, stored back to back | |||
// so the 8 lines occupy 0x200 contiguous bytes at P1. | |||
.L_N16: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S1, 0x20 | |||
xvld U2, S2, 0x00 | |||
xvld U3, S2, 0x20 | |||
xvst U0, P1, 0x00 | |||
xvst U1, P1, 0x20 | |||
xvst U2, P1, 0x40 | |||
xvst U3, P1, 0x60 | |||
xvld U4, S3, 0x00 | |||
xvld U5, S3, 0x20 | |||
xvld U6, S4, 0x00 | |||
xvld U7, S4, 0x20 | |||
xvst U4, P1, 0x80 | |||
xvst U5, P1, 0xA0 | |||
xvst U6, P1, 0xC0 | |||
xvst U7, P1, 0xE0 | |||
xvld U0, S5, 0x00 | |||
xvld U1, S5, 0x20 | |||
xvld U2, S6, 0x00 | |||
xvld U3, S6, 0x20 | |||
xvst U0, P1, 0x100 | |||
xvst U1, P1, 0x120 | |||
xvst U2, P1, 0x140 | |||
xvst U3, P1, 0x160 | |||
xvld U4, S7, 0x00 | |||
xvld U5, S7, 0x20 | |||
xvld U6, S8, 0x00 | |||
xvld U7, S8, 0x20 | |||
xvst U4, P1, 0x180 | |||
xvst U5, P1, 0x1A0 | |||
xvst U6, P1, 0x1C0 | |||
xvst U7, P1, 0x1E0 | |||
// Advance every line pointer by the 0x40 bytes just consumed. | |||
PTR_ADDI S1, S1, 0x40 | |||
PTR_ADDI S2, S2, 0x40 | |||
PTR_ADDI S3, S3, 0x40 | |||
PTR_ADDI S4, S4, 0x40 | |||
PTR_ADDI S5, S5, 0x40 | |||
PTR_ADDI S6, S6, 0x40 | |||
PTR_ADDI S7, S7, 0x40 | |||
PTR_ADDI S8, S8, 0x40 | |||
PTR_ADDI I, I, -1 | |||
// Next 16-column panel is T1 = M * 64 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_N16 | |||
// Remainder of N: bit 3 set means one 8-column chunk is left. | |||
.L_N15: | |||
andi I, N, 0x08 | |||
beq ZERO, I, .L_N7 | |||
// 8-column chunk: one 32-byte vector per line, written to the P2 region. | |||
.L_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
// NOTE(review): GST/GLD are helper macros from loongarch64_asm.S; they | |||
// presumably emit one store/load per (register, base, offset) triple - confirm. | |||
GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ | |||
U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI S5, S5, 0x20 | |||
PTR_ADDI S6, S6, 0x20 | |||
PTR_ADDI S7, S7, 0x20 | |||
PTR_ADDI S8, S8, 0x20 | |||
// P2 moves past the 8 x 0x20 bytes just written. | |||
PTR_ADDI P2, P2, 0x100 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_N3 | |||
// 4-column chunk: 16 bytes per line through $vr0..$vr7 into the P3 region. | |||
.L_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
$vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ | |||
$vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI S5, S5, 0x10 | |||
PTR_ADDI S6, S6, 0x10 | |||
PTR_ADDI S7, S7, 0x10 | |||
PTR_ADDI S8, S8, 0x10 | |||
PTR_ADDI P3, P3, 0x80 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_N1 | |||
// 2-column chunk: 8 bytes per line through $f0..$f7 into the P4 region. | |||
.L_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ | |||
$f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI S3, S3, 0x08 | |||
PTR_ADDI S4, S4, 0x08 | |||
PTR_ADDI S5, S5, 0x08 | |||
PTR_ADDI S6, S6, 0x08 | |||
PTR_ADDI S7, S7, 0x08 | |||
PTR_ADDI S8, S8, 0x08 | |||
PTR_ADDI P4, P4, 0x40 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P5 region. | |||
.L_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_N0 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ | |||
$f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI S3, S3, 0x04 | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI S5, S5, 0x04 | |||
PTR_ADDI S6, S6, 0x04 | |||
PTR_ADDI S7, S7, 0x04 | |||
PTR_ADDI S8, S8, 0x04 | |||
PTR_ADDI P5, P5, 0x20 | |||
// Repeat while groups of 8 lines remain (J was decremented at loop entry). | |||
.L_N0: | |||
blt ZERO, J, .L_M8 | |||
// Remainder of M: bit 2 set means one group of 4 source lines is left. | |||
.L_M7: | |||
andi J, M, 0x04 | |||
beq ZERO, J, .L_M3 | |||
.L_M4: | |||
// S1..S4 = the 4 lines of this group; S0 moves on by 4 * TL. | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S3, S1, T0 | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S0, S3, T0 | |||
// P0 advances by 0x100 bytes (4 lines x 0x40 bytes). | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x100 | |||
// I = N / 16 : number of full 16-column chunks. | |||
PTR_SRAI I, N, 0x04 | |||
beq ZERO, I, .L_M4_N15 | |||
.align 5 | |||
// 16-column chunk: two 32-byte vectors per line, 0x100 contiguous bytes at P1. | |||
.L_M4_N16: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S1, 0x20 | |||
xvld U2, S2, 0x00 | |||
xvld U3, S2, 0x20 | |||
xvst U0, P1, 0x00 | |||
xvst U1, P1, 0x20 | |||
xvst U2, P1, 0x40 | |||
xvst U3, P1, 0x60 | |||
xvld U4, S3, 0x00 | |||
xvld U5, S3, 0x20 | |||
xvld U6, S4, 0x00 | |||
xvld U7, S4, 0x20 | |||
xvst U4, P1, 0x80 | |||
xvst U5, P1, 0xA0 | |||
xvst U6, P1, 0xC0 | |||
xvst U7, P1, 0xE0 | |||
PTR_ADDI S1, S1, 0x40 | |||
PTR_ADDI S2, S2, 0x40 | |||
PTR_ADDI S3, S3, 0x40 | |||
PTR_ADDI S4, S4, 0x40 | |||
PTR_ADDI I, I, -1 | |||
// Next 16-column panel is T1 = M * 64 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M4_N16 | |||
// Bit 3 of N set: one 8-column chunk is left. | |||
.L_M4_N15: | |||
andi I, N, 0x08 | |||
beq ZERO, I, .L_M4_N7 | |||
// 8-column chunk: one 32-byte vector per line into the P2 region. | |||
.L_M4_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI P2, P2, 0x80 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M4_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M4_N3 | |||
// 4-column chunk: 16 bytes per line into the P3 region. | |||
.L_M4_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI P3, P3, 0x40 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M4_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M4_N1 | |||
// 2-column chunk: 8 bytes per line into the P4 region. | |||
.L_M4_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI S3, S3, 0x08 | |||
PTR_ADDI S4, S4, 0x08 | |||
PTR_ADDI P4, P4, 0x20 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P5 region. | |||
.L_M4_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M3 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI S3, S3, 0x04 | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI P5, P5, 0x10 | |||
// Remainder of M: bit 1 set means one group of 2 source lines is left. | |||
.L_M3: | |||
andi J, M, 0x02 | |||
beq ZERO, J, .L_M1 | |||
.L_M2: | |||
// S1, S2 = the 2 lines of this group; S0 moves on by T0 = 2 * TL. | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S0, S0, T0 | |||
// P0 advances by 0x80 bytes (2 lines x 0x40 bytes). | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x80 | |||
// I = N / 16 : number of full 16-column chunks. | |||
PTR_SRAI I, N, 0x04 | |||
beq ZERO, I, .L_M2_N15 | |||
.align 5 | |||
// 16-column chunk: two 32-byte vectors per line, 0x80 contiguous bytes at P1. | |||
.L_M2_N16: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S1, 0x20 | |||
xvld U2, S2, 0x00 | |||
xvld U3, S2, 0x20 | |||
xvst U0, P1, 0x00 | |||
xvst U1, P1, 0x20 | |||
xvst U2, P1, 0x40 | |||
xvst U3, P1, 0x60 | |||
PTR_ADDI S1, S1, 0x40 | |||
PTR_ADDI S2, S2, 0x40 | |||
PTR_ADDI I, I, -1 | |||
// Next 16-column panel is T1 = M * 64 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M2_N16 | |||
// Bit 3 of N set: one 8-column chunk is left. | |||
.L_M2_N15: | |||
andi I, N, 0x08 | |||
beq ZERO, I, .L_M2_N7 | |||
// 8-column chunk: one 32-byte vector per line into the P2 region. | |||
.L_M2_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
GST xv, , U0, P2, 0x00, U1, P2, 0x20 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI P2, P2, 0x40 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M2_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M2_N3 | |||
// 4-column chunk: 16 bytes per line into the P3 region. | |||
.L_M2_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI P3, P3, 0x20 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M2_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M2_N1 | |||
// 2-column chunk: 8 bytes per line into the P4 region. | |||
.L_M2_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI P4, P4, 0x10 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P5 region. | |||
.L_M2_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M1 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI P5, P5, 0x08 | |||
// Remainder of M: bit 0 set means a single source line is left. | |||
.L_M1: | |||
andi J, M, 0x01 | |||
beq ZERO, J, .L_M0 | |||
move S1, S0 | |||
// NOTE(review): S2 is computed here but never read in this single-line section. | |||
PTR_ADD S2, S0, TL | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x40 | |||
// I = N / 16 : number of full 16-column chunks. | |||
PTR_SRAI I, N, 0x04 | |||
beq ZERO, I, .L_M1_N15 | |||
.align 5 | |||
// 16-column chunk: two 32-byte vectors from the line, 0x40 bytes at P1. | |||
.L_M1_N16: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S1, 0x20 | |||
xvst U0, P1, 0x00 | |||
xvst U1, P1, 0x20 | |||
PTR_ADDI S1, S1, 0x40 | |||
PTR_ADDI I, I, -1 | |||
// Next 16-column panel is T1 = M * 64 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M1_N16 | |||
// Bit 3 of N set: one 8-column chunk is left. | |||
.L_M1_N15: | |||
andi I, N, 0x08 | |||
beq ZERO, I, .L_M1_N7 | |||
// 8-column chunk: one 32-byte vector into the P2 region. | |||
.L_M1_N8: | |||
xvld U0, S1, 0x00 | |||
GST xv, , U0, P2, 0x00 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI P2, P2, 0x20 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M1_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M1_N3 | |||
// 4-column chunk: 16 bytes into the P3 region. | |||
.L_M1_N4: | |||
GLD v, , $vr0, S1, 0x00 | |||
GST v, , $vr0, P3, 0x00 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI P3, P3, 0x10 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M1_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M1_N1 | |||
// 2-column chunk: 8 bytes into the P4 region. | |||
.L_M1_N2: | |||
GLD f, d, $f0, S1, 0x00 | |||
GST f, d, $f0, P4, 0x00 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI P4, P4, 0x08 | |||
// Bit 0 of N set: a single 4-byte element is left; it goes to the P5 region. | |||
.L_M1_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M0 | |||
GLD f, s, $f0, S1, 0x00 | |||
GST f, s, $f0, P5, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI P5, P5, 0x04 | |||
// Common exit. | |||
.L_M0: | |||
// NOTE(review): presumably restores what push_if_used 24, 8 saved - confirm in loongarch64_asm.S. | |||
pop_if_used 24, 8 | |||
// Return: jump to the address in $r1, discarding the link value into $r0. | |||
jirl $r0, $r1, 0x00 | |||
EPILOGUE |
@@ -0,0 +1,406 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2023, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2023/08/23 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
*********************************************************************/ | |||
/* Function parameters */ | |||
// Integer argument registers, in call order. | |||
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: I is the inner (N direction) counter / bit-test scratch, | |||
// J is the outer (M direction) counter. | |||
#define I $r9 | |||
#define J $r10 | |||
// Source pointers: S0 marks the start of the next group of source lines, | |||
// S1..S8 walk up to eight consecutive source lines of the current group. | |||
#define S0 $r11 | |||
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
// Destination pointers: P0/P1 address the 8-column panels, P2..P4 address | |||
// the separate output regions for the 4-, 2- and 1-column remainders. | |||
#define P0 $r20 | |||
#define P1 $r23 | |||
#define P2 $r24 | |||
#define P3 $r25 | |||
#define P4 $r26 | |||
// Temporaries; after setup T0 holds 2 * TL and T1 holds M << 5. | |||
#define T0 $r27 | |||
#define T1 $r28 | |||
// TL shares $r7 with LDA: LDA is overwritten by its byte-scaled value. | |||
#define TL $r7 | |||
// Drop any earlier definition of ZERO before aliasing it to the zero register. | |||
#undef ZERO | |||
#define ZERO $r0 | |||
/* LASX vectors */ | |||
// 256-bit vector registers used as load/store staging. | |||
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
// Loops outline | |||
//.L_M8 <------------------- | |||
//| .L_N8: | | |||
//| .L_N7: | Main Loop | |||
//| .L_N4: | | |||
//| .L_N3: | | |||
//| .L_N2: | | |||
//| .L_N1: | | |||
//| .L_N0: --------------- | |||
//.L_M7 | |||
//.L_M4 | |||
//| .L_M4_N8: | |||
//| .L_M4_N7: | |||
//| .L_M4_N4: | |||
//| .L_M4_N3: | |||
//| .L_M4_N2: | |||
//| .L_M4_N1: | |||
//.L_M3 | |||
//.L_M2 | |||
//| .L_M2_N8: | |||
//| .L_M2_N7: | |||
//| .L_M2_N4: | |||
//| .L_M2_N3: | |||
//| .L_M2_N2: | |||
//| .L_M2_N1: | |||
//.L_M1 | |||
//| .L_M1_N8: | |||
//| .L_M1_N7: | |||
//| .L_M1_N4: | |||
//| .L_M1_N3: | |||
//| .L_M1_N2: | |||
//| .L_M1_N1: | |||
//.L_M0 | |||
// Entry point of the 8-wide copy kernel. | |||
// Inputs: M ($r4), N ($r5), SRC ($r6), LDA ($r7), DST ($r8). | |||
// The code below reads source lines that are TL = LDA << 2 bytes apart and | |||
// writes them to DST in groups: 8-column panels first, then separate | |||
// regions for the 4-, 2- and 1-column remainders of N. | |||
// NOTE(review): the 4-byte scaling suggests single-precision elements - confirm. | |||
PROLOGUE | |||
// NOTE(review): presumably saves the callee-saved registers this kernel | |||
// uses; the macro lives in loongarch64_asm.S and is not visible here. | |||
push_if_used 23, 8 | |||
move S0, SRC | |||
move P0, DST | |||
// T1 = N rounded down to a multiple of 8. | |||
// NOTE(review): the T0 value built here (N rounded down to a multiple of 16) | |||
// is overwritten below before anything reads it. | |||
PTR_SRAI T0, N, 0x04 | |||
PTR_SRAI T1, N, 0x03 | |||
PTR_SLLI T0, T0, 0x04 | |||
PTR_SLLI T1, T1, 0x03 | |||
// P2 = DST + M * T1 * 4 : output position of the 4-column remainder. | |||
PTR_MUL P2, M, T1 | |||
PTR_SLLI P2, P2, 0x02 | |||
PTR_ADD P2, DST, P2 | |||
// T0 = N rounded down to a multiple of 4, T1 = N rounded down to a multiple of 2. | |||
PTR_SRAI T0, N, 0x02 | |||
PTR_SRAI T1, N, 0x01 | |||
PTR_SLLI T0, T0, 0x02 | |||
PTR_SLLI T1, T1, 0x01 | |||
// P3 = DST + M * T0 * 4 : output position of the 2-column remainder. | |||
// P4 = DST + M * T1 * 4 : output position of the 1-column remainder. | |||
PTR_MUL P3, M, T0 | |||
PTR_MUL P4, M, T1 | |||
PTR_SLLI P3, P3, 0x02 | |||
PTR_SLLI P4, P4, 0x02 | |||
PTR_ADD P3, DST, P3 | |||
PTR_ADD P4, DST, P4 | |||
// TL aliases LDA ($r7): from here on it holds the line stride in bytes. | |||
PTR_SLLI TL, LDA, 0x02 | |||
// J = M / 8 : number of full 8-line groups. | |||
PTR_SRAI J, M, 0x03 | |||
// T0 = 2 * TL (stride of two lines), T1 = M * 32 (bytes between consecutive | |||
// 8-column panels in the output). | |||
PTR_SLLI T0, TL, 0x01 | |||
PTR_SLLI T1, M, 0x05 | |||
// No full group of 8 lines: go straight to the M remainder handling. | |||
beq ZERO, J, .L_M7 | |||
.align 5 | |||
// Main loop: each pass handles one group of 8 source lines; J counts the | |||
// groups still to do. | |||
.L_M8: | |||
// S1..S8 = the 8 lines of this group (S0 + k * TL, k = 0..7); S0 then moves | |||
// to the first line of the next group (S0 + 8 * TL). | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S3, S1, T0 | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S5, S3, T0 | |||
PTR_ADD S6, S4, T0 | |||
PTR_ADD S7, S5, T0 | |||
PTR_ADD S8, S6, T0 | |||
PTR_ADD S0, S7, T0 | |||
// P1 walks the 8-column panels for this group; P0 advances by 0x100 bytes | |||
// (8 lines x 0x20 bytes) to the slot of the next group. | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x100 | |||
// I = N / 8 : number of full 8-column chunks. | |||
PTR_SRAI I, N, 0x03 | |||
PTR_ADDI J, J, -1 | |||
beq ZERO, I, .L_N7 | |||
// 8-column chunk: one 32-byte vector load per line, stored back to back so | |||
// the 8 lines occupy 0x100 contiguous bytes at P1. | |||
.L_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
// NOTE(review): GST/GLD are helper macros from loongarch64_asm.S; they | |||
// presumably emit one store/load per (register, base, offset) triple - confirm. | |||
GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ | |||
U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 | |||
// Advance every line pointer by the 0x20 bytes just consumed. | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI S5, S5, 0x20 | |||
PTR_ADDI S6, S6, 0x20 | |||
PTR_ADDI S7, S7, 0x20 | |||
PTR_ADDI S8, S8, 0x20 | |||
PTR_ADDI I, I, -1 | |||
// Next 8-column panel is T1 = M * 32 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_N8 | |||
// Remainder of N: bit 2 set means one 4-column chunk is left. | |||
.L_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_N3 | |||
// 4-column chunk: 16 bytes per line through $vr0..$vr7 into the P2 region. | |||
.L_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
$vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ | |||
$vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI S5, S5, 0x10 | |||
PTR_ADDI S6, S6, 0x10 | |||
PTR_ADDI S7, S7, 0x10 | |||
PTR_ADDI S8, S8, 0x10 | |||
PTR_ADDI P2, P2, 0x80 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_N1 | |||
// 2-column chunk: 8 bytes per line through $f0..$f7 into the P3 region. | |||
.L_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ | |||
$f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI S3, S3, 0x08 | |||
PTR_ADDI S4, S4, 0x08 | |||
PTR_ADDI S5, S5, 0x08 | |||
PTR_ADDI S6, S6, 0x08 | |||
PTR_ADDI S7, S7, 0x08 | |||
PTR_ADDI S8, S8, 0x08 | |||
PTR_ADDI P3, P3, 0x40 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P4 region. | |||
.L_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_N0 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ | |||
$f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI S3, S3, 0x04 | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI S5, S5, 0x04 | |||
PTR_ADDI S6, S6, 0x04 | |||
PTR_ADDI S7, S7, 0x04 | |||
PTR_ADDI S8, S8, 0x04 | |||
PTR_ADDI P4, P4, 0x20 | |||
// Repeat while groups of 8 lines remain (J was decremented at loop entry). | |||
.L_N0: | |||
blt ZERO, J, .L_M8 | |||
// Remainder of M: bit 2 set means one group of 4 source lines is left. | |||
.L_M7: | |||
andi J, M, 0x04 | |||
beq ZERO, J, .L_M3 | |||
.L_M4: | |||
// S1..S4 = the 4 lines of this group; S0 moves on by 4 * TL. | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S3, S1, T0 | |||
PTR_ADD S4, S2, T0 | |||
PTR_ADD S0, S3, T0 | |||
// P0 advances by 0x80 bytes (4 lines x 0x20 bytes). | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x80 | |||
// I = N / 8 : number of full 8-column chunks. | |||
PTR_SRAI I, N, 0x03 | |||
beq ZERO, I, .L_M4_N7 | |||
.align 5 | |||
// 8-column chunk: one 32-byte vector per line, 0x80 contiguous bytes at P1. | |||
.L_M4_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI S3, S3, 0x20 | |||
PTR_ADDI S4, S4, 0x20 | |||
PTR_ADDI I, I, -1 | |||
// Next 8-column panel is T1 = M * 32 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M4_N8 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M4_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M4_N3 | |||
// 4-column chunk: 16 bytes per line into the P2 region. | |||
.L_M4_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI S3, S3, 0x10 | |||
PTR_ADDI S4, S4, 0x10 | |||
PTR_ADDI P2, P2, 0x40 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M4_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M4_N1 | |||
// 2-column chunk: 8 bytes per line into the P3 region. | |||
.L_M4_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI S3, S3, 0x08 | |||
PTR_ADDI S4, S4, 0x08 | |||
PTR_ADDI P3, P3, 0x20 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P4 region. | |||
.L_M4_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M3 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI S3, S3, 0x04 | |||
PTR_ADDI S4, S4, 0x04 | |||
PTR_ADDI P4, P4, 0x10 | |||
// Remainder of M: bit 1 set means one group of 2 source lines is left. | |||
.L_M3: | |||
andi J, M, 0x02 | |||
beq ZERO, J, .L_M1 | |||
.L_M2: | |||
// S1, S2 = the 2 lines of this group; S0 moves on by T0 = 2 * TL. | |||
move S1, S0 | |||
PTR_ADD S2, S0, TL | |||
PTR_ADD S0, S0, T0 | |||
// P0 advances by 0x40 bytes (2 lines x 0x20 bytes). | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x40 | |||
// I = N / 8 : number of full 8-column chunks. | |||
PTR_SRAI I, N, 0x03 | |||
beq ZERO, I, .L_M2_N7 | |||
.align 5 | |||
// 8-column chunk: one 32-byte vector per line, 0x40 contiguous bytes at P1. | |||
.L_M2_N8: | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
GST xv, , U0, P1, 0x00, U1, P1, 0x20 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI S2, S2, 0x20 | |||
PTR_ADDI I, I, -1 | |||
// Next 8-column panel is T1 = M * 32 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M2_N8 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M2_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M2_N3 | |||
// 4-column chunk: 16 bytes per line into the P2 region. | |||
.L_M2_N4: | |||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI S2, S2, 0x10 | |||
PTR_ADDI P2, P2, 0x20 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M2_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M2_N1 | |||
// 2-column chunk: 8 bytes per line into the P3 region. | |||
.L_M2_N2: | |||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI S2, S2, 0x08 | |||
PTR_ADDI P3, P3, 0x10 | |||
// Bit 0 of N set: a single column is left; 4 bytes per line into the P4 region. | |||
.L_M2_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M1 | |||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI S2, S2, 0x04 | |||
PTR_ADDI P4, P4, 0x08 | |||
// Remainder of M: bit 0 set means a single source line is left. | |||
.L_M1: | |||
andi J, M, 0x01 | |||
beq ZERO, J, .L_M0 | |||
move S1, S0 | |||
// NOTE(review): S2 is computed here but never read in this single-line section. | |||
PTR_ADD S2, S0, TL | |||
move P1, P0 | |||
PTR_ADDI P0, P0, 0x20 | |||
// I = N / 8 : number of full 8-column chunks. | |||
PTR_SRAI I, N, 0x03 | |||
beq ZERO, I, .L_M1_N7 | |||
.align 5 | |||
// 8-column chunk: one 32-byte vector from the line, stored at P1. | |||
.L_M1_N8: | |||
xvld U0, S1, 0x00 | |||
GST xv, , U0, P1, 0x00 | |||
PTR_ADDI S1, S1, 0x20 | |||
PTR_ADDI I, I, -1 | |||
// Next 8-column panel is T1 = M * 32 bytes further in the output. | |||
PTR_ADD P1, P1, T1 | |||
blt ZERO, I, .L_M1_N8 | |||
// Bit 2 of N set: one 4-column chunk is left. | |||
.L_M1_N7: | |||
andi I, N, 0x04 | |||
beq ZERO, I, .L_M1_N3 | |||
// 4-column chunk: 16 bytes into the P2 region. | |||
.L_M1_N4: | |||
GLD v, , $vr0, S1, 0x00 | |||
GST v, , $vr0, P2, 0x00 | |||
PTR_ADDI S1, S1, 0x10 | |||
PTR_ADDI P2, P2, 0x10 | |||
// Bit 1 of N set: one 2-column chunk is left. | |||
.L_M1_N3: | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M1_N1 | |||
// 2-column chunk: 8 bytes into the P3 region. | |||
.L_M1_N2: | |||
GLD f, d, $f0, S1, 0x00 | |||
GST f, d, $f0, P3, 0x00 | |||
PTR_ADDI S1, S1, 0x08 | |||
PTR_ADDI P3, P3, 0x08 | |||
// Bit 0 of N set: a single 4-byte element is left; it goes to the P4 region. | |||
.L_M1_N1: | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M0 | |||
GLD f, s, $f0, S1, 0x00 | |||
GST f, s, $f0, P4, 0x00 | |||
PTR_ADDI S1, S1, 0x04 | |||
PTR_ADDI P4, P4, 0x04 | |||
// Common exit. | |||
.L_M0: | |||
// NOTE(review): presumably restores what push_if_used 23, 8 saved - confirm in loongarch64_asm.S. | |||
pop_if_used 23, 8 | |||
// Return: jump to the address in $r1, discarding the link value into $r0. | |||
jirl $r0, $r1, 0x00 | |||
EPILOGUE |
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmov.d s2, s1 | |||
bge $r0, N, .L999 | |||
slli.d INCX, INCX, BASE_SHIFT | |||
bge $r0, INCX, .L999 | |||
beq $r0, INCX, .L999 | |||
srai.d I, N, 3 | |||
bne INCX, TEMP, .L20 | |||
bge $r0, I, .L15 | |||
@@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MTC s1, $r0 | |||
bge $r0, N, .L999 | |||
slli.d INCX, INCX, ZBASE_SHIFT | |||
bge $r0, INCX, .L999 | |||
beq $r0, INCX, .L999 | |||
move XX, X | |||
MOV s2, s1 | |||
srai.d I, N, 2 | |||
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
FLOAT absxi = 0.0; | |||
if (n <= 0 || inc_x <= 0) return(0.0); | |||
if (n <= 0 || inc_x == 0) return(0.0); | |||
if ( n == 1 ) return( ABS(x[0]) ); | |||
n *= inc_x; | |||
@@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
BLASLONG inc_x2; | |||
FLOAT temp; | |||
if (n <= 0 || inc_x <= 0) return(0.0); | |||
if (n <= 0 || inc_x == 0) return(0.0); | |||
inc_x2 = 2 * inc_x; | |||
@@ -77,7 +77,7 @@ | |||
blez N, .L999 | |||
mov.d s2, s1 | |||
blez INCX, .L999 | |||
beqz INCX, .L999 | |||
dsll INCX, INCX, ZBASE_SHIFT | |||
dsra I, N, 2 | |||
@@ -81,7 +81,7 @@ | |||
blez N, .L999 | |||
MTC $0, s1 | |||
blez INCX, .L999 | |||
beqz INCX, .L999 | |||
dsll INCX, INCX, BASE_SHIFT | |||
move XX, X | |||
@@ -77,7 +77,7 @@ | |||
blez N, .L999 | |||
mov.d s2, s1 | |||
blez INCX, .L999 | |||
beqz INCX, .L999 | |||
dsll INCX, INCX, BASE_SHIFT | |||
bne INCX, TEMP, .L20 | |||
@@ -80,7 +80,7 @@ | |||
blez N, .L999 | |||
MTC $0, s1 | |||
blez INCX, .L999 | |||
beqz INCX, .L999 | |||
dsll INCX, INCX, ZBASE_SHIFT | |||
move XX, X | |||
@@ -99,7 +99,7 @@ | |||
cmpwi cr0, N, 0 | |||
ble- LL(9999) | |||
cmpwi cr0, INCX, 0 | |||
ble- LL(9999) | |||
beq- LL(9999) | |||
fmr f0, f1 | |||
fmr f2, f1 | |||
@@ -119,7 +119,7 @@ | |||
cmpwi cr0, N, 0 | |||
ble LL(99) | |||
cmpwi cr0, INCX, 0 | |||
ble LL(99) | |||
beq LL(99) | |||
andi. r0, X, 2 * SIZE - 1 | |||
bne LL(100) | |||
@@ -104,7 +104,7 @@ | |||
cmpwi cr0, N, 0 | |||
ble- LL(999) | |||
cmpwi cr0, INCX, 0 | |||
ble- LL(999) | |||
beq- LL(999) | |||
fmr f0, f1 | |||
sub X, X, INCX | |||