@@ -1,44 +1,44 @@ | |||
macos_instance: | |||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
task: | |||
name: AppleM1/LLVM | |||
compile_script: | |||
- brew install llvm | |||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang | |||
#task: | |||
# name: AppleM1/LLVM | |||
# compile_script: | |||
# - brew install llvm | |||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang | |||
task: | |||
name: AppleM1/LLVM/ILP64 | |||
compile_script: | |||
- brew install llvm | |||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 | |||
#task: | |||
# name: AppleM1/LLVM/ILP64 | |||
# compile_script: | |||
# - brew install llvm | |||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 | |||
task: | |||
name: AppleM1/LLVM/CMAKE | |||
compile_script: | |||
- brew install llvm | |||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- mkdir build | |||
- cd build | |||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
- make -j 4 | |||
#task: | |||
# name: AppleM1/LLVM/CMAKE | |||
# compile_script: | |||
# - brew install llvm | |||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
# - mkdir build | |||
# - cd build | |||
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
# - make -j 4 | |||
task: | |||
name: AppleM1/GCC/MAKE/OPENMP | |||
compile_script: | |||
- brew install gcc@11 | |||
- export PATH=/opt/homebrew/bin:$PATH | |||
- export LDFLAGS="-L/opt/homebrew/lib" | |||
- export CPPFLAGS="-I/opt/homebrew/include" | |||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
#task: | |||
# name: AppleM1/GCC/MAKE/OPENMP | |||
# compile_script: | |||
# - brew install gcc@11 | |||
# - export PATH=/opt/homebrew/bin:$PATH | |||
# - export LDFLAGS="-L/opt/homebrew/lib" | |||
# - export CPPFLAGS="-I/opt/homebrew/include" | |||
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
macos_instance: | |||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
@@ -0,0 +1,149 @@ | |||
name: apple m | |||
on: [push, pull_request] | |||
concurrency: | |||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | |||
cancel-in-progress: true | |||
permissions: | |||
contents: read # to fetch code (actions/checkout) | |||
jobs: | |||
build: | |||
if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
runs-on: macos-14 | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
build: [cmake, make] | |||
fortran: [gfortran] | |||
openmp: [0, 1] | |||
ilp64: [0, 1] | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
- name: Print system information | |||
run: | | |||
if [ "$RUNNER_OS" == "Linux" ]; then | |||
cat /proc/cpuinfo | |||
elif [ "$RUNNER_OS" == "macOS" ]; then | |||
sysctl -a | grep machdep.cpu | |||
else | |||
echo "::error::$RUNNER_OS not supported" | |||
exit 1 | |||
fi | |||
- name: Install Dependencies | |||
run: | | |||
if [ "$RUNNER_OS" == "Linux" ]; then | |||
sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||
elif [ "$RUNNER_OS" == "macOS" ]; then | |||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
brew reinstall gcc | |||
brew install coreutils cmake ccache | |||
brew install llvm | |||
else | |||
echo "::error::$RUNNER_OS not supported" | |||
exit 1 | |||
fi | |||
- name: Compilation cache | |||
uses: actions/cache@v3 | |||
with: | |||
path: ~/.ccache | |||
# We include the commit sha in the cache key, as new cache entries are | |||
# only created if there is no existing entry for the key yet. | |||
# GNU make and cmake call the compilers differently. It looks like | |||
# that causes the cache to mismatch. Keep the ccache for both build | |||
# tools separate to avoid polluting each other. | |||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} | |||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. | |||
restore-keys: | | |||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} | |||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} | |||
ccache-${{ runner.os }}-${{ matrix.build }} | |||
- name: Configure ccache | |||
run: | | |||
if [ "${{ matrix.build }}" = "make" ]; then | |||
# Add ccache to path | |||
if [ "$RUNNER_OS" = "Linux" ]; then | |||
echo "/usr/lib/ccache" >> $GITHUB_PATH | |||
elif [ "$RUNNER_OS" = "macOS" ]; then | |||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH | |||
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH | |||
echo "" >>$GITHUB_PATH | |||
else | |||
echo "::error::$RUNNER_OS not supported" | |||
exit 1 | |||
fi | |||
fi | |||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). | |||
test -d ~/.ccache || mkdir -p ~/.ccache | |||
echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
echo "compression = true" >> ~/.ccache/ccache.conf | |||
ccache -s | |||
- name: Build OpenBLAS | |||
run: | | |||
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
export CC="/opt/homebrew/opt/llvm/bin/clang" | |||
case "${{ matrix.build }}" in | |||
"make") | |||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" | |||
;; | |||
"cmake") | |||
export LDFLAGS="$LDFLAGS -Wl,-ld_classic" | |||
mkdir build && cd build | |||
cmake -DDYNAMIC_ARCH=1 \ | |||
-DUSE_OPENMP=${{matrix.openmp}} \ | |||
-DINTERFACE64=${{matrix.ilp64}} \ | |||
-DNOFORTRAN=0 \ | |||
-DBUILD_WITHOUT_LAPACK=0 \ | |||
-DCMAKE_VERBOSE_MAKEFILE=ON \ | |||
-DCMAKE_BUILD_TYPE=Release \ | |||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ | |||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
.. | |||
cmake --build . | |||
;; | |||
*) | |||
echo "::error::Configuration not supported" | |||
exit 1 | |||
;; | |||
esac | |||
- name: Show ccache status | |||
continue-on-error: true | |||
run: ccache -s | |||
- name: Run tests | |||
timeout-minutes: 60 | |||
run: | | |||
case "${{ matrix.build }}" in | |||
"make") | |||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' | |||
echo "::group::Tests in 'test' directory" | |||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
echo "::endgroup::" | |||
echo "::group::Tests in 'ctest' directory" | |||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
echo "::endgroup::" | |||
echo "::group::Tests in 'utest' directory" | |||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
echo "::endgroup::" | |||
;; | |||
"cmake") | |||
cd build && ctest | |||
;; | |||
*) | |||
echo "::error::Configuration not supported" | |||
exit 1 | |||
;; | |||
esac |
@@ -0,0 +1,253 @@ | |||
name: riscv64 zvl256b qemu test | |||
on: [push, pull_request] | |||
concurrency: | |||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | |||
cancel-in-progress: true | |||
permissions: | |||
contents: read # to fetch code (actions/checkout) | |||
jobs: | |||
TEST: | |||
if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
runs-on: ubuntu-latest | |||
env: | |||
triple: riscv64-unknown-linux-gnu | |||
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain | |||
riscv_gnu_toolchain_version: 13.2.0 | |||
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
include: | |||
- target: RISCV64_ZVL128B | |||
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 | |||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 | |||
- target: RISCV64_ZVL256B | |||
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 | |||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
- name: install build deps | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install autoconf automake autotools-dev ninja-build make \ | |||
libgomp1-riscv64-cross ccache | |||
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} | |||
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt | |||
- name: Compilation cache | |||
uses: actions/cache@v3 | |||
with: | |||
path: ~/.ccache | |||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
restore-keys: | | |||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
ccache-${{ runner.os }}-${{ matrix.target }} | |||
- name: Configure ccache | |||
run: | | |||
test -d ~/.ccache || mkdir -p ~/.ccache | |||
echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
echo "compression = true" >> ~/.ccache/ccache.conf | |||
ccache -s | |||
- name: build OpenBLAS libs | |||
run: | | |||
export PATH="/opt/riscv/bin:$PATH" | |||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ | |||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
RANLIB='ccache ${triple}-ranlib' \ | |||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) | |||
- name: build OpenBLAS tests | |||
run: | | |||
export PATH="/opt/riscv/bin:$PATH" | |||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
CC='${triple}-gcc' \ | |||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
RANLIB='ccache ${triple}-ranlib' \ | |||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests | |||
- name: build lapack-netlib tests | |||
working-directory: ./lapack-netlib/TESTING | |||
run: | | |||
export PATH="/opt/riscv/bin:$PATH" | |||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
CC='${triple}-gcc' \ | |||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
RANLIB='ccache ${triple}-ranlib' \ | |||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ | |||
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ | |||
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ | |||
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ | |||
- name: OpenBLAS tests | |||
shell: bash | |||
run: | | |||
export PATH="/opt/riscv/bin:$PATH" | |||
export QEMU_CPU=${{ matrix.qemu_cpu }} | |||
rm -rf ./test_out | |||
mkdir -p ./test_out | |||
# run_test DIR CMD [DATA] | |||
# Runs ./DIR/CMD under qemu-riscv64 (stdin taken from ./DIR/DATA when DATA is given) and | |||
# logs the combined stdout/stderr to ./test_out/DIR.CMD, preceded by a header line naming | |||
# the command. tee appends (-a) so the header written just before is not truncated away. | |||
# The exit status is captured with "|| RV=$?" so that a failing pipeline does not abort the | |||
# (background) function under errexit before the "*** FAIL" marker is written; the marker | |||
# is what the caller greps for after "wait". With pipefail active RV reflects the failing | |||
# stage of the pipeline, otherwise it is tee's status. | |||
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; local RV=0; \ | |||
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \ | |||
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee -a $OUTPUT || RV=$? ; \ | |||
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee -a $OUTPUT || RV=$? ; fi ; \ | |||
if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \ | |||
} | |||
run_test test cblat1 & | |||
run_test test cblat2 cblat2.dat & | |||
run_test test cblat3 cblat3.dat & | |||
run_test test dblat1 & | |||
run_test test dblat2 dblat2.dat & | |||
run_test test dblat3 dblat3.dat & | |||
run_test test sblat1 & | |||
run_test test sblat2 sblat2.dat & | |||
run_test test sblat3 sblat3.dat & | |||
run_test test zblat1 & | |||
run_test test zblat2 zblat2.dat & | |||
run_test test zblat3 zblat3.dat & | |||
run_test ctest xccblat1 & | |||
run_test ctest xccblat2 cin2 & | |||
run_test ctest xccblat3 cin3 & | |||
run_test ctest xdcblat1 & | |||
run_test ctest xdcblat2 din2 & | |||
run_test ctest xdcblat3 din3 & | |||
run_test ctest xscblat1 & | |||
run_test ctest xscblat2 sin2 & | |||
run_test ctest xscblat3 sin3 & | |||
run_test ctest xzcblat1 & | |||
run_test ctest xzcblat2 zin2 & | |||
run_test ctest xzcblat3 zin3 & | |||
wait | |||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) | |||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi | |||
- name: netlib tests | |||
shell: bash | |||
run: | | |||
: # these take a very long time | |||
echo "Skipping netlib tests in CI" | |||
exit 0 | |||
: # comment out exit above to enable the tests | |||
: # probably we want to identify a subset to run in CI | |||
export PATH="/opt/riscv/bin:$PATH" | |||
export QEMU_CPU=${{ matrix.qemu_cpu }} | |||
rm -rf ./test_out | |||
mkdir -p ./test_out | |||
# run_test OUTFILE INFILE PROGRAM DESCRIPTION | |||
# Runs ./lapack-netlib/TESTING/PROGRAM under qemu-riscv64 with stdin from | |||
# ./lapack-netlib/TESTING/INFILE and logs stdout/stderr to ./test_out/OUTFILE, then appends | |||
# "*** FAIL" markers (which the caller greps for after "wait") when: | |||
#   - the pipeline exits nonzero (captured via "|| RV=$?" so errexit cannot abort the | |||
#     background function before the marker is written), | |||
#   - the log contains "fail", or | |||
#   - the log has a line containing "rror" that contains neither "passed" nor "largest error". | |||
# The last check must not use "grep -q" on the upstream stages: -q prints nothing, so the | |||
# downstream greps would never see any input and the check could never trigger. | |||
# NOTE(review): tee is not given -a, so it truncates OUTFILE and the two header lines echoed | |||
# before it do not survive; left as is because lapack_testing.py parses these files - confirm. | |||
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; local RV=0; \ | |||
echo "$4" >> $OUTPUT; \ | |||
echo "$CMD" >> $OUTPUT; \ | |||
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT || RV=$? ; \ | |||
if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \ | |||
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \ | |||
if grep rror $OUTPUT | grep -v "passed" | grep -v "largest error" > /dev/null ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \ | |||
} | |||
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & | |||
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & | |||
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & | |||
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & | |||
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & | |||
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & | |||
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & | |||
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & | |||
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" & | |||
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & | |||
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & | |||
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & | |||
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & | |||
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & | |||
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & | |||
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & | |||
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & | |||
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & | |||
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & | |||
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & | |||
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & | |||
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & | |||
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & | |||
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & | |||
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & | |||
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & | |||
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & | |||
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & | |||
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & | |||
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & | |||
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & | |||
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & | |||
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & | |||
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & | |||
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & | |||
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & | |||
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & | |||
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & | |||
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & | |||
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & | |||
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & | |||
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & | |||
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & | |||
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & | |||
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & | |||
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & | |||
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & | |||
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & | |||
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & | |||
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & | |||
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & | |||
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & | |||
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & | |||
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & | |||
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & | |||
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & | |||
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & | |||
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & | |||
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & | |||
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & | |||
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & | |||
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & | |||
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & | |||
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & | |||
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & | |||
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & | |||
wait | |||
# Print every log that contains a FAIL marker and remember that something failed. | |||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) | |||
# Summarise all logs with the netlib helper script. | |||
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary | |||
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" | |||
# Default to -1 so that a summary without an "ALL PRECISIONS" line counts as a failure. | |||
NUMERICAL_ERRORS=-1 | |||
OTHER_ERRORS=-1 | |||
# Source the two error counts (fields 5 and 7 of the "ALL PRECISIONS" line) as shell variables. | |||
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary) | |||
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi | |||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi |
@@ -219,6 +219,7 @@ In chronological order: | |||
* Mark Seminatore <https://github.com/mseminatore> | |||
* [2023-11-09] Improve Windows threading performance scaling | |||
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency | |||
* Dirreke <https://github.com/Dirreke> | |||
* [2024-01-16] Add basic support for the CSKY architecture |
@@ -156,6 +156,9 @@ endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
@$(MAKE) -C exports dll | |||
endif | |||
ifeq ($(OSNAME), AIX) | |||
@$(MAKE) -C exports so | |||
endif | |||
endif | |||
tests : shared | |||
@@ -1715,11 +1715,7 @@ endif | |||
LIBDLLNAME = $(LIBPREFIX).dll | |||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a | |||
ifneq ($(OSNAME), AIX) | |||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | |||
else | |||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) | |||
endif | |||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | |||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | |||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | |||
@@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran | |||
``` | |||
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available. | |||
e.g.: | |||
```sh | |||
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \ | |||
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \ | |||
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \ | |||
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \ | |||
HOSTCC=gcc HOSTFC=gfortran -j | |||
``` | |||
### Support for multiple targets in a single library | |||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. | |||
@@ -64,6 +64,7 @@ else () | |||
"#define NEEDBUNDERSCORE 1\n") | |||
endif() | |||
# When a Fortran compiler is configured, derive F_COMPILER from its executable name: | |||
# strip the directory and extension (NAME_WE), then upper-case the result. | |||
if (CMAKE_Fortran_COMPILER) | |||
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) | |||
string(TOUPPER ${F_COMPILER} F_COMPILER) | |||
endif() |
@@ -6,9 +6,6 @@ | |||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
# This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
if (BINARY64 AND INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
endif () | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
endif () | |||
@@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
if (MIPS64) | |||
if (BINARY64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") | |||
if (INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
endif () | |||
else () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") | |||
endif () | |||
@@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
endif () | |||
endif () | |||
endif () | |||
if (ARM64 AND INTERFACE64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
endif () | |||
else () | |||
if (BINARY64) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
@@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
#define BUFFER_SIZE ( 32 << 20) | |||
#define SEEK_ADDRESS | |||
#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) | |||
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280) | |||
# include <riscv_vector.h> | |||
#endif | |||
@@ -40,6 +40,10 @@ else() | |||
c_${float_char}blas1.c) | |||
endif() | |||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
# OpenMP build with GNU Fortran but Clang as the C compiler: drop -fopenmp from the Fortran | |||
# flags and link the test against omp and pthread explicitly. | |||
# The compiler IDs are quoted so that an empty ID (e.g. no Fortran compiler enabled) | |||
# compares as an empty string instead of producing a malformed if() condition. | |||
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat1 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat1 m) | |||
endif() | |||
@@ -65,6 +69,10 @@ else() | |||
constant.c) | |||
endif() | |||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
# OpenMP build with GNU Fortran but Clang as the C compiler: drop -fopenmp from the Fortran | |||
# flags and link the test against omp and pthread explicitly. | |||
# The compiler IDs are quoted so that an empty ID (e.g. no Fortran compiler enabled) | |||
# compares as an empty string instead of producing a malformed if() condition. | |||
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat2 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat2 m) | |||
endif() | |||
@@ -90,6 +98,10 @@ else() | |||
constant.c) | |||
endif() | |||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
# OpenMP build with GNU Fortran but Clang as the C compiler: drop -fopenmp from the Fortran | |||
# flags and link the test against omp and pthread explicitly. | |||
# The compiler IDs are quoted so that an empty ID (e.g. no Fortran compiler enabled) | |||
# compares as an empty string instead of producing a malformed if() condition. | |||
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat3 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat3 m) | |||
endif() | |||
@@ -48,6 +48,12 @@ | |||
#endif | |||
#endif | |||
#ifdef SMP_DEBUG | |||
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) | |||
#else | |||
# define MT_TRACE(...) | |||
#endif | |||
/* This is a thread implementation for Win32 lazy implementation */ | |||
/* Thread server common information */ | |||
@@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||
static DWORD blas_threads_id[MAX_CPU_NUMBER]; | |||
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | |||
#if defined (__GNUC__) && (__GNUC__ < 6) | |||
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) | |||
#else | |||
#if defined(_WIN64) | |||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) | |||
#else | |||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) | |||
#endif | |||
#endif | |||
// | |||
// Legacy code path | |||
// | |||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { | |||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
if (!(mode & BLAS_COMPLEX)){ | |||
if (!(mode & BLAS_COMPLEX)) { | |||
#ifdef EXPRECISION | |||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
/* REAL / Extended Double */ | |||
@@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> c, args -> ldc, sb); | |||
} else | |||
#endif | |||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
/* REAL / Double */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
double *, BLASLONG, double *, BLASLONG, | |||
@@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> a, args -> lda, | |||
args -> b, args -> ldb, | |||
args -> c, args -> ldc, sb); | |||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE){ | |||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE) { | |||
/* REAL / Single */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
float *, BLASLONG, float *, BLASLONG, | |||
@@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> b, args -> ldb, | |||
args -> c, args -> ldc, sb); | |||
#ifdef BUILD_BFLOAT16 | |||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { | |||
/* REAL / BFLOAT16 */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
@@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> a, args -> lda, | |||
args -> b, args -> ldb, | |||
args -> c, args -> ldc, sb); | |||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){ | |||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16) { | |||
/* REAL / BLAS_STOBF16 */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
float *, BLASLONG, bfloat16 *, BLASLONG, | |||
@@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> a, args -> lda, | |||
args -> b, args -> ldb, | |||
args -> c, args -> ldc, sb); | |||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ | |||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { | |||
/* REAL / BLAS_DTOBF16 */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
double *, BLASLONG, bfloat16 *, BLASLONG, | |||
@@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
} | |||
} else { | |||
#ifdef EXPRECISION | |||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
/* COMPLEX / Extended Double */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
@@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
args -> c, args -> ldc, sb); | |||
} else | |||
#endif | |||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
/* COMPLEX / Double */ | |||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
double *, BLASLONG, double *, BLASLONG, | |||
@@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
} | |||
} | |||
/* This is a main routine of threads. Each thread waits until job is */ | |||
/* queued. */ | |||
static DWORD WINAPI blas_thread_server(void *arg){ | |||
// | |||
// This is a main routine of threads. Each thread waits until job is queued. | |||
// | |||
static DWORD WINAPI blas_thread_server(void *arg) { | |||
/* Thread identifier */ | |||
BLASLONG cpu = (BLASLONG)arg; | |||
@@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
/* Each server needs each buffer */ | |||
buffer = blas_memory_alloc(2); | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); | |||
#endif | |||
MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | |||
while (1){ | |||
while (1) { | |||
/* Waiting for Queue */ | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); | |||
#endif | |||
// event raised when work is added to the queue | |||
WaitForSingleObject(kickoff_event, INFINITE); | |||
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); | |||
if (cpu > thread_target - 2) | |||
{ | |||
//printf("thread [%d] exiting.\n", cpu); | |||
break; // excess thread, so worker thread exits | |||
} | |||
// event raised when work is added to the queue | |||
WaitForSingleObject(kickoff_event, INFINITE); | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); | |||
#endif | |||
if (cpu > thread_target - 2) { | |||
//MT_TRACE("thread [%d] exiting.\n", cpu); | |||
break; // excess thread, so worker thread exits | |||
} | |||
MT_TRACE("Server[%2ld] Got it.\n", cpu); | |||
#if 1 | |||
EnterCriticalSection(&queue_lock); | |||
queue = work_queue; | |||
@@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
work_queue = work_queue->next; | |||
LeaveCriticalSection(&queue_lock); | |||
#else | |||
volatile blas_queue_t* queue_next; | |||
INT_PTR prev_value; | |||
do { | |||
queue = (volatile blas_queue_t*)work_queue; | |||
if (!queue) | |||
break; | |||
queue_next = (volatile blas_queue_t*)queue->next; | |||
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); | |||
} while (prev_value != queue); | |||
#endif | |||
if (queue) { | |||
if (queue) { | |||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
sa = queue -> sa; | |||
sb = queue -> sb; | |||
#ifdef CONSISTENT_FPCSR | |||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
#endif | |||
#ifdef CONSISTENT_FPCSR | |||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
#endif | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||
#endif | |||
// fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||
#ifdef MONITOR | |||
main_status[cpu] = MAIN_RUNNING1; | |||
#endif | |||
#ifdef MONITOR | |||
main_status[cpu] = MAIN_RUNNING1; | |||
#endif | |||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
if (sa == NULL) | |||
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
if (sb == NULL) { | |||
if (!(queue -> mode & BLAS_COMPLEX)){ | |||
if (!(queue -> mode & BLAS_COMPLEX)) { | |||
#ifdef EXPRECISION | |||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
} else | |||
#endif | |||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
#ifdef BUILD_DOUBLE | |||
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
@@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
/* Other types in future */ | |||
} | |||
} | |||
queue->sb=sb; | |||
queue->sb=sb; | |||
} | |||
#ifdef MONITOR | |||
main_status[cpu] = MAIN_RUNNING2; | |||
#endif | |||
#ifdef MONITOR | |||
main_status[cpu] = MAIN_RUNNING2; | |||
#endif | |||
if (!(queue -> mode & BLAS_LEGACY)) { | |||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
} else { | |||
legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
} | |||
}else{ | |||
continue; //if queue == NULL | |||
} | |||
} else { | |||
continue; //if queue == NULL | |||
} | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | |||
#endif | |||
MT_TRACE("Server[%2ld] Finished!\n", cpu); | |||
queue->finished = 1; | |||
queue->finished = 1; | |||
} | |||
/* Shutdown procedure */ | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); | |||
#endif | |||
MT_TRACE("Server[%2ld] Shutdown!\n", cpu); | |||
blas_memory_free(buffer); | |||
return 0; | |||
} | |||
} | |||
/* Initializing routine */ | |||
int blas_thread_init(void){ | |||
// | |||
// Initializing routine | |||
// | |||
int blas_thread_init(void) { | |||
BLASLONG i; | |||
if (blas_server_avail || (blas_cpu_number <= 1)) return 0; | |||
LOCK_COMMAND(&server_lock); | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", | |||
blas_cpu_number); | |||
#endif | |||
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | |||
if (!blas_server_avail){ | |||
// create the kickoff Event | |||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
if (!blas_server_avail) { | |||
// create the kickoff Event | |||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
thread_target = blas_cpu_number; | |||
thread_target = blas_cpu_number; | |||
InitializeCriticalSection(&queue_lock); | |||
for(i = 0; i < blas_cpu_number - 1; i++){ | |||
//printf("thread_init: creating thread [%d]\n", i); | |||
for(i = 0; i < blas_cpu_number - 1; i++) { | |||
//MT_TRACE("thread_init: creating thread [%d]\n", i); | |||
blas_threads[i] = CreateThread(NULL, 0, | |||
blas_thread_server, (void *)i, | |||
@@ -400,15 +371,12 @@ int blas_thread_init(void){ | |||
return 0; | |||
} | |||
/* | |||
User can call one of two routines. | |||
exec_blas_async ... immediately returns after jobs are queued. | |||
exec_blas ... returns after jobs are finished. | |||
*/ | |||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
// | |||
// User can call one of two routines. | |||
// exec_blas_async ... immediately returns after jobs are queued. | |||
// exec_blas ... returns after jobs are finished. | |||
// | |||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { | |||
#if defined(SMP_SERVER) | |||
// Handle lazy re-init of the thread-pool after a POSIX fork | |||
@@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); | |||
#endif | |||
current->finished = 0; | |||
current->finished = 0; | |||
current = current -> next; | |||
pos ++; | |||
} | |||
@@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
if (!work_queue) | |||
{ | |||
work_queue = queue; | |||
work_queue = queue; | |||
} | |||
else | |||
{ | |||
blas_queue_t *next_item = work_queue; | |||
// find the end of the work queue | |||
while (next_item) | |||
next_item = next_item->next; | |||
// find the end of the work queue | |||
while (next_item) | |||
next_item = next_item->next; | |||
// add new work to the end | |||
next_item = queue; | |||
// add new work to the end | |||
next_item = queue; | |||
} | |||
LeaveCriticalSection(&queue_lock); | |||
@@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
return 0; | |||
} | |||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
// | |||
// Join. Wait for all queued tasks to complete | |||
// | |||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Synchronization Waiting.\n"); | |||
#endif | |||
MT_TRACE("Synchronization Waiting.\n"); | |||
while (num){ | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Waiting Queue ..\n"); | |||
#endif | |||
while (!queue->finished) | |||
YIELDING; | |||
while (num) { | |||
MT_TRACE("Waiting Queue ..\n"); | |||
queue = queue->next; | |||
num--; | |||
} | |||
while (!queue->finished) | |||
YIELDING; | |||
queue = queue->next; | |||
num--; | |||
} | |||
MT_TRACE("Completely Done.\n\n"); | |||
#ifdef SMP_DEBUG | |||
fprintf(STDERR, "Completely Done.\n\n"); | |||
#endif | |||
// if work was added to the queue after this batch we can't sleep the worker threads | |||
// by resetting the event | |||
EnterCriticalSection(&queue_lock); | |||
@@ -490,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
return 0; | |||
} | |||
/* Execute Threads */ | |||
int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
// | |||
// Execute Threads | |||
// | |||
int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
// Handle lazy re-init of the thread-pool after a POSIX fork | |||
@@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
if ((num <= 0) || (queue == NULL)) return 0; | |||
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); | |||
if ((num > 1) && queue -> next) | |||
exec_blas_async(1, queue -> next); | |||
routine = queue -> routine; | |||
if (queue -> mode & BLAS_LEGACY) { | |||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
} else | |||
} else { | |||
if (queue -> mode & BLAS_PTHREAD) { | |||
void (*pthreadcompat)(void *) = queue -> routine; | |||
(pthreadcompat)(queue -> args); | |||
} else | |||
(routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
queue -> sa, queue -> sb, 0); | |||
queue -> sa, queue -> sb, 0); | |||
} | |||
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); | |||
if ((num > 1) && queue -> next) | |||
exec_blas_async_wait(num - 1, queue -> next); | |||
return 0; | |||
} | |||
/* Shutdown procedure, but user don't have to call this routine. The */ | |||
/* kernel automatically kill threads. */ | |||
int BLASFUNC(blas_thread_shutdown)(void){ | |||
// | |||
// Shutdown procedure. The user doesn't have to call this routine; the | |||
// kernel automatically kills the threads. | |||
// | |||
int BLASFUNC(blas_thread_shutdown)(void) { | |||
int i; | |||
@@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
LOCK_COMMAND(&server_lock); | |||
if (blas_server_avail){ | |||
if (blas_server_avail) { | |||
for(i = 0; i < blas_num_threads - 1; i++){ | |||
for (i = 0; i < blas_num_threads - 1; i++) { | |||
// Could also just use WaitForMultipleObjects | |||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); | |||
@@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
return 0; | |||
} | |||
// | |||
// Legacy function to set number of threads | |||
// | |||
void goto_set_num_threads(int num_threads) | |||
{ | |||
long i; | |||
@@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads) | |||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
if (blas_server_avail && num_threads < blas_num_threads) { | |||
if (blas_server_avail && num_threads < blas_num_threads) { | |||
LOCK_COMMAND(&server_lock); | |||
thread_target = num_threads; | |||
@@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads) | |||
SetEvent(kickoff_event); | |||
for (i = num_threads - 1; i < blas_num_threads - 1; i++) { | |||
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
WaitForSingleObject(blas_threads[i], INFINITE); | |||
//printf("set_num_threads: thread [%d] has quit.\n", i); | |||
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); | |||
CloseHandle(blas_threads[i]); | |||
} | |||
@@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads) | |||
thread_target = num_threads; | |||
//increased_threads = 1; | |||
if (!blas_server_avail){ | |||
//increased_threads = 1; | |||
if (!blas_server_avail) { | |||
// create the kickoff Event | |||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
@@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads) | |||
blas_server_avail = 1; | |||
} | |||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
//printf("set_num_threads: creating thread [%d]\n", i); | |||
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { | |||
//MT_TRACE("set_num_threads: creating thread [%d]\n", i); | |||
blas_threads[i] = CreateThread(NULL, 0, | |||
blas_thread_server, (void *)i, | |||
@@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads) | |||
blas_cpu_number = num_threads; | |||
} | |||
// | |||
// OpenBLAS function to set the thread count | |||
// | |||
void openblas_set_num_threads(int num) | |||
{ | |||
goto_set_num_threads(num); | |||
@@ -73,6 +73,10 @@ endif | |||
endif | |||
endif | |||
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX) | |||
EXTRALIB += -lxlf90 | |||
endif | |||
ifeq ($(C_COMPILER), PGI) | |||
EXTRALIB += -pgf90libs | |||
endif | |||
@@ -256,6 +260,20 @@ endif | |||
ifeq ($(OSNAME), AIX) | |||
so : ../$(LIBSONAME) linktest.c | |||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK. | |||
rm -f linktest | |||
../$(LIBSONAME) : aix.exp | |||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB) | |||
aix.exp : | |||
/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \ | |||
|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \ | |||
{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \ | |||
/usr/bin/sort -u > aix.exp | |||
ifeq ($(COMPILER_F77), xlf) | |||
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def | |||
@@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include <sys/sysinfo.h> | |||
#include <unistd.h> | |||
#endif | |||
#if defined(AIX) | |||
#if defined(_AIX) | |||
#include <unistd.h> | |||
#include <sys/systemcfg.h> | |||
#include <sys/sysinfo.h> | |||
#endif | |||
@@ -1870,11 +1872,13 @@ static int get_num_cores(void) { | |||
return count; | |||
#elif defined(AIX) | |||
#elif defined(_AIX) | |||
//returns the number of processors which are currently online | |||
count = sysconf(_SC_NPROCESSORS_ONLN); | |||
if (count <= 0) count = 2; | |||
return count; | |||
#else | |||
return 2; | |||
#endif | |||
@@ -0,0 +1,587 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Negating packed copy: every value read from a is written to b with its
 * sign flipped.
 *
 * n is consumed in groups: n / 16 groups of 16, then one group each of
 * 8, 4, 2 and 1 for every matching bit set in n.  A group of size g covers
 * 2 * g consecutive FLOATs per row (presumably interleaved complex pairs --
 * TODO confirm against the callers).  Within a group the rows, spaced
 * 2 * lda FLOATs apart, are emitted back to back into b; rows are taken two
 * at a time with a trailing single row when m is odd.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  /* FLOATs copied per row for each group size, widest first. */
  static const BLASLONG span_of[5] = {32, 16, 8, 4, 2};

  const BLASLONG row_step = 2 * lda;   /* distance between consecutive rows */
  FLOAT *group = a;                    /* start of the current group in a   */
  FLOAT *dst   = b;                    /* next free slot in b               */
  BLASLONG g;

  for (g = 0; g < 5; g++) {
    const BLASLONG span = span_of[g];
    BLASLONG repeats;

    /* Only the widest group can repeat; the narrower ones occur at most
       once, selected by the corresponding bit of n (8, 4, 2, 1). */
    if (g == 0) {
      repeats = n >> 4;
    } else {
      repeats = (n & (span >> 1)) ? 1 : 0;
    }

    for (; repeats > 0; repeats--) {
      FLOAT *src = group;
      BLASLONG pairs, k;

      group += span;

      /* Two rows per iteration: first row, then the row one step below. */
      for (pairs = m >> 1; pairs > 0; pairs--) {
        for (k = 0; k < span; k++) {
          dst[k] = -src[k];
        }
        for (k = 0; k < span; k++) {
          dst[span + k] = -src[row_step + k];
        }
        src += 2 * row_step;
        dst += 2 * span;
      }

      /* Left-over row when m is odd. */
      if (m & 1) {
        for (k = 0; k < span; k++) {
          dst[k] = -src[k];
        }
        dst += span;
      }
    }
  }

  return 0;
}
@@ -0,0 +1,333 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack m rows by n columns of complex data (two FLOATs per element) from `a`
 * into the contiguous buffer `b`.
 *
 * The n columns are processed in panels: first (n >> 4) panels of 16 columns,
 * then one panel each of 8, 4, 2 and 1 columns for every corresponding bit
 * that is set in n.  For every panel row, 2 * width FLOATs are appended to
 * `b` (real part followed by imaginary part for each panel column).
 *
 * Each panel column c keeps its own source pointer.  With
 * offset = posX - posY, decremented once per row:
 *   - while offset > -c the pointer starts at a + (posX + c) * 2 + posY * lda
 *     and advances by lda per row;
 *   - otherwise it starts at a + posY * 2 + (posX + c) * lda and advances by
 *     2 per row.
 * NOTE(review): this looks like a symmetric-matrix pack that reads only one
 * stored triangle and mirrors it -- confirm against the callers.
 *
 * lda is given in complex elements and is doubled on entry so it can be used
 * as a FLOAT stride.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG width, panels, rows_left, col, diag;
  FLOAT *src[16];
  FLOAT stage[32];

  lda *= 2;

  for (width = 16; width > 0; width >>= 1) {

    /* Any number of full 16-wide panels, at most one of each narrower width. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      diag = posX - posY;

      /* Choose the starting address of every column in this panel. */
      for (col = 0; col < width; col ++) {
        if (diag > -col) {
          src[col] = a + (posX + col) * 2 + posY * lda;
        } else {
          src[col] = a + posY * 2 + (posX + col) * lda;
        }
      }

      for (rows_left = m; rows_left > 0; rows_left --) {

        /* Read the whole packed row before writing anything to b, and step
           every source pointer according to the current offset. */
        for (col = 0; col < width; col ++) {
          stage[2 * col + 0] = *(src[col] + 0);
          stage[2 * col + 1] = *(src[col] + 1);

          if (diag > -col) {
            src[col] += lda;
          } else {
            src[col] += 2;
          }
        }

        for (col = 0; col < 2 * width; col ++) {
          b[col] = stage[col];
        }

        b += 2 * width;
        diag --;
      }

      /* posX is not read again after the final panel. */
      posX += width;
    }
  }

  return 0;
}
@@ -0,0 +1,332 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack m rows by n columns of complex data (two FLOATs per element) from `a`
 * into the contiguous buffer `b`.
 *
 * The n columns are processed in panels: first (n >> 4) panels of 16 columns,
 * then one panel each of 8, 4, 2 and 1 columns for every corresponding bit
 * that is set in n.  For every panel row, 2 * width FLOATs are appended to
 * `b` (real part followed by imaginary part for each panel column).
 *
 * Each panel column c keeps its own source pointer.  With
 * offset = posX - posY, decremented once per row:
 *   - while offset > -c the pointer starts at a + posY * 2 + (posX + c) * lda
 *     and advances by 2 per row;
 *   - otherwise it starts at a + (posX + c) * 2 + posY * lda and advances by
 *     lda per row.
 * NOTE(review): this looks like a symmetric-matrix pack that reads only one
 * stored triangle and mirrors it -- confirm against the callers.
 *
 * lda is given in complex elements and is doubled on entry so it can be used
 * as a FLOAT stride.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG width, panels, rows_left, col, diag;
  FLOAT *src[16];
  FLOAT stage[32];

  lda *= 2;

  for (width = 16; width > 0; width >>= 1) {

    /* Any number of full 16-wide panels, at most one of each narrower width. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      diag = posX - posY;

      /* Choose the starting address of every column in this panel. */
      for (col = 0; col < width; col ++) {
        if (diag > -col) {
          src[col] = a + posY * 2 + (posX + col) * lda;
        } else {
          src[col] = a + (posX + col) * 2 + posY * lda;
        }
      }

      for (rows_left = m; rows_left > 0; rows_left --) {

        /* Read the whole packed row before writing anything to b, and step
           every source pointer according to the current offset. */
        for (col = 0; col < width; col ++) {
          stage[2 * col + 0] = *(src[col] + 0);
          stage[2 * col + 1] = *(src[col] + 1);

          if (diag > -col) {
            src[col] += 2;
          } else {
            src[col] += lda;
          }
        }

        for (col = 0; col < 2 * width; col ++) {
          b[col] = stage[col];
        }

        b += 2 * width;
        diag --;
      }

      /* posX is not read again after the final panel. */
      posX += width;
    }
  }

  return 0;
}
@@ -0,0 +1,308 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Pack part of an m-row, n-column complex matrix (two FLOATs per element,
 * columns lda complex elements apart) from `a` into `b`.
 *
 * Columns are handled in panels: (n >> 4) panels of 16 columns, then one
 * panel each of 8, 4, 2 and 1 columns for every corresponding bit set in n.
 * Every row of a panel owns 2 * width FLOATs in `b`, whether or not anything
 * is written to them.
 *
 * jj starts at `offset` and grows by the panel width after each panel.  For
 * row i, with d = i - jj:
 *   - 0 <= d < width : the first d entries of the row are copied, then the
 *                      d-th entry is handed to compinv(), which writes its
 *                      result at b + 2 * d;
 *   - d >= width     : all `width` entries of the row are copied;
 *   - d < 0          : nothing is written for this row.
 *
 * NOTE(review): compinv() is defined outside this file; presumably it stores
 * the reciprocal of the complex diagonal entry (or unity for unit-diagonal
 * builds) -- confirm against its definition.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag, jj;
  FLOAT *rowp;
  FLOAT re, im;

  lda *= 2;
  jj = offset;

  for (width = 16; width > 0; width >>= 1) {

    /* Any number of full 16-wide panels, at most one of each narrower width. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      /* rowp addresses the current row in the panel's first column; column
         c of the same row is at rowp + c * lda. */
      rowp = a;
      a += width * lda;

      for (row = 0; row < m; row ++) {

        diag = row - jj;

        if ((row >= jj) && (diag < width)) {
          /* Row crosses the diagonal inside this panel. */
          for (col = 0; col < diag; col ++) {
            *(b + col * 2 + 0) = *(rowp + col * lda + 0);
            *(b + col * 2 + 1) = *(rowp + col * lda + 1);
          }

          re = *(rowp + diag * lda + 0);
          im = *(rowp + diag * lda + 1);

          compinv(b + diag * 2, re, im);
        }

        if (diag >= width) {
          /* Row lies entirely past the diagonal: copy every panel column. */
          for (col = 0; col < width; col ++) {
            *(b + col * 2 + 0) = *(rowp + col * lda + 0);
            *(b + col * 2 + 1) = *(rowp + col * lda + 1);
          }
        }

        rowp += 2;
        b += 2 * width;
      }

      /* Neither jj nor a is read again after the final panel. */
      jj += width;
    }
  }

  return 0;
}
@@ -0,0 +1,264 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Packs m rows of `a` into the buffer `b`, one panel at a time.
 *
 * Panels are taken left to right: (n >> 4) panels of 16 entries, then at most
 * one panel each of 8, 4, 2 and 1 entries, chosen by the low bits of n.
 * Every entry occupies two consecutive FLOATs (presumably real/imaginary
 * parts -- confirm against the caller), which is why lda is doubled.
 *
 * Within a panel, consecutive entries of a row are adjacent in memory and
 * consecutive rows are lda FLOATs apart.  For each row, with
 * dist = row - (offset + entries already packed):
 *   dist <  0          : the whole row of the panel is copied verbatim;
 *   0 <= dist < width  : entry `dist` is handed to compinv(), which is given
 *                        the matching slot of b as its destination, and the
 *                        entries after it are copied; the slots before it are
 *                        left untouched;
 *   dist >= width      : nothing is written.
 * In every case b advances by one full panel row (2 * width FLOATs).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, col, row, dist, k;
  FLOAT *src;
  FLOAT diag_re, diag_im;

  lda *= 2;   /* row stride expressed in FLOATs */
  col  = 0;   /* entries per row already packed by earlier panels */

  for (width = 16; width > 0; width >>= 1) {

    /* Only the widest stage can repeat; the narrower ones run at most once. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = ((n & width) != 0) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      src = a + col * 2;

      for (row = 0; row < m; row ++) {

        dist = row - (offset + col);

        if (dist < 0) {
          /* Row lies entirely before the diagonal: plain copy. */
          for (k = 0; k < 2 * width; k ++) {
            b[k] = src[k];
          }
        } else if (dist < width) {
          /* Row crosses the diagonal inside this panel. */
          diag_re = src[dist * 2 + 0];
          diag_im = src[dist * 2 + 1];

          compinv(b + dist * 2, diag_re, diag_im);

          for (k = dist + 1; k < width; k ++) {
            b[k * 2 + 0] = src[k * 2 + 0];
            b[k * 2 + 1] = src[k * 2 + 1];
          }
        }

        b   += 2 * width;
        src += lda;
      }

      col += width;
    }
  }

  return 0;
}
@@ -0,0 +1,313 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Packs `a` into the buffer `b`, one panel at a time, gathering the entries
 * of each packed row from locations that are lda FLOATs apart.
 *
 * Panels are taken in order: (n >> 4) panels of 16 entries, then at most one
 * panel each of 8, 4, 2 and 1 entries, chosen by the low bits of n.
 * Every entry occupies two consecutive FLOATs (presumably real/imaginary
 * parts -- confirm against the caller), which is why lda is doubled.
 *
 * Within a panel, entry k of packed row `row` is read from
 * src + k * lda, where src starts at the panel base and moves forward by two
 * FLOATs per row.  For each row, with
 * dist = row - (offset + entries already packed):
 *   dist <  0          : all `width` entries are gathered;
 *   0 <= dist < width  : entry `dist` is handed to compinv(), which is given
 *                        the matching slot of b as its destination, and the
 *                        entries after it are gathered; the slots before it
 *                        are left untouched;
 *   dist >= width      : nothing is written.
 * In every case b advances by one full panel row (2 * width FLOATs).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, col, row, dist, k;
  FLOAT *src;
  FLOAT diag_re, diag_im;

  lda *= 2;   /* stride between gathered entries, in FLOATs */
  col  = 0;   /* entries per row already packed by earlier panels */

  for (width = 16; width > 0; width >>= 1) {

    /* Only the widest stage can repeat; the narrower ones run at most once. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = ((n & width) != 0) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      src = a + col * lda;

      for (row = 0; row < m; row ++) {

        dist = row - (offset + col);

        if (dist < 0) {
          /* Row lies entirely before the diagonal: gather everything. */
          for (k = 0; k < width; k ++) {
            b[k * 2 + 0] = src[k * lda + 0];
            b[k * 2 + 1] = src[k * lda + 1];
          }
        } else if (dist < width) {
          /* Row crosses the diagonal inside this panel. */
          diag_re = src[dist * lda + 0];
          diag_im = src[dist * lda + 1];

          compinv(b + dist * 2, diag_re, diag_im);

          for (k = dist + 1; k < width; k ++) {
            b[k * 2 + 0] = src[k * lda + 0];
            b[k * 2 + 1] = src[k * lda + 1];
          }
        }

        src += 2;
        b   += 2 * width;
      }

      col += width;
    }
  }

  return 0;
}
@@ -0,0 +1,261 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
/*
 * Packs m rows of `a` into the buffer `b`, one panel at a time.
 *
 * Panels are taken left to right: (n >> 4) panels of 16 entries, then at most
 * one panel each of 8, 4, 2 and 1 entries, chosen by the low bits of n.
 * Every entry occupies two consecutive FLOATs (presumably real/imaginary
 * parts -- confirm against the caller), which is why lda is doubled.
 *
 * Within a panel, consecutive entries of a row are adjacent in memory and
 * consecutive rows are lda FLOATs apart.  For each row, with
 * dist = row - (offset + entries already packed):
 *   dist >= width      : the whole row of the panel is copied verbatim;
 *   0 <= dist < width  : the entries before position `dist` are copied and
 *                        entry `dist` is handed to compinv(), which is given
 *                        the matching slot of b as its destination; the slots
 *                        after it are left untouched;
 *   dist <  0          : nothing is written.
 * In every case b advances by one full panel row (2 * width FLOATs).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, col, row, dist, k;
  FLOAT *src;
  FLOAT diag_re, diag_im;

  lda *= 2;   /* row stride expressed in FLOATs */
  col  = 0;   /* entries per row already packed by earlier panels */

  for (width = 16; width > 0; width >>= 1) {

    /* Only the widest stage can repeat; the narrower ones run at most once. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = ((n & width) != 0) ? 1 : 0;
    }

    for (; panels > 0; panels --) {

      src = a + col * 2;

      for (row = 0; row < m; row ++) {

        dist = row - (offset + col);

        if (dist >= width) {
          /* Row lies entirely past the diagonal: plain copy. */
          for (k = 0; k < 2 * width; k ++) {
            b[k] = src[k];
          }
        } else if (dist >= 0) {
          /* Row crosses the diagonal inside this panel. */
          for (k = 0; k < dist; k ++) {
            b[k * 2 + 0] = src[k * 2 + 0];
            b[k * 2 + 1] = src[k * 2 + 1];
          }

          diag_re = src[dist * 2 + 0];
          diag_im = src[dist * 2 + 1];

          compinv(b + dist * 2, diag_re, diag_im);
        }

        b   += 2 * width;
        src += lda;
      }

      col += width;
    }
  }

  return 0;
}
@@ -111,12 +111,19 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Source-file selection for the SGEMV and C* (GEMM / GEMV / TRSM) kernels.
SGEMVNKERNEL = sgemv_n_8_lasx.S | |||
SGEMVTKERNEL = sgemv_t_8_lasx.S | |||
# NOTE(review): CGEMMKERNEL, CGEMMONCOPY and CGEMMOTCOPY are each assigned
# twice in this span (the *_lsx.S files first, the *_lasx.S files second).
# If both sets of lines are really present, the later assignment is the one
# that stays in effect -- confirm the *_lsx.S lines are not leftovers.
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S | |||
CGEMMONCOPY = cgemm_ncopy_2_lsx.S | |||
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S | |||
CGEMMKERNEL = cgemm_kernel_16x4_lasx.S | |||
CGEMMINCOPY = cgemm_ncopy_16_lasx.S | |||
CGEMMITCOPY = cgemm_tcopy_16_lasx.S | |||
CGEMMONCOPY = cgemm_ncopy_4_lasx.S | |||
CGEMMOTCOPY = cgemm_tcopy_4_lasx.S | |||
# Object names for the four CGEMM copy routines, built from TSUFFIX and SUFFIX.
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMVNKERNEL = cgemv_n_8_lasx.S | |||
CGEMVTKERNEL = cgemv_t_8_lasx.S | |||
# The CTRSM kernels use the shared C sources under ../generic.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
@@ -132,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# Object names for the ZGEMM "on"/"ot" copy routines, built from TSUFFIX and SUFFIX.
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMV kernels come from LASX assembly sources.
ZGEMVNKERNEL = zgemv_n_4_lasx.S | |||
ZGEMVTKERNEL = zgemv_t_4_lasx.S | |||
# The ZTRSM kernels use the shared C sources under ../generic.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
@@ -0,0 +1,691 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/*
 * Register aliases used by the routine below.
 * Note that TL and T0 (defined further down) reuse the registers of LDA and
 * SRC: the code right after the prologue copies SRC into TS and DST into TD
 * before TL/T0 are written, so the incoming values are not needed afterwards.
 */
/* Function parameters */ | |||
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
/* Loop counters: I is the inner (M-direction) counter, J the outer (N-direction) one. */
#define I $r9 | |||
#define J $r10 | |||
/* S1..S16: one source pointer per line handled in a pass; each is advanced as data is consumed. */
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
#define S9 $r20 | |||
/* S10..S16, TD and TS live in $r23..$r31, which the prologue stores on the stack. */
#define S10 $r23 | |||
#define S11 $r24 | |||
#define S12 $r25 | |||
#define S13 $r26 | |||
#define S14 $r27 | |||
#define S15 $r28 | |||
#define S16 $r29 | |||
/* TD: running destination pointer; TS: running source base pointer. */
#define TD $r30 | |||
#define TS $r31 | |||
/* TL = LDA << 3 and T0 = TL << 1 (set after the prologue); they share $r7/$r6 with LDA/SRC. */
#define TL $r7 | |||
#define T0 $r6 | |||
#define ZERO $r0 | |||
/* F0..F7: scalar registers used with fld.d/fst.d in the leftover (M & 7) loops. */
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
/* LASX vectors */ | |||
/* U0..U15: filled by xvld in the main loops (a few are reused as scratch copies via xvand.v). */
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
#define U8 $xr8 | |||
#define U9 $xr9 | |||
#define U10 $xr10 | |||
#define U11 $xr11 | |||
#define U12 $xr12 | |||
#define U13 $xr13 | |||
#define U14 $xr14 | |||
#define U15 $xr15 | |||
/*
 * D0..D15: results of the pack/permute steps, written out with xvst.
 * NOTE(review): the prologue stores $f23..$f31, presumably because
 * D7..D15 ($xr23..$xr31) overlay those registers -- confirm.
 */
#define D0 $xr16 | |||
#define D1 $xr17 | |||
#define D2 $xr18 | |||
#define D3 $xr19 | |||
#define D4 $xr20 | |||
#define D5 $xr21 | |||
#define D6 $xr22 | |||
#define D7 $xr23 | |||
#define D8 $xr24 | |||
#define D9 $xr25 | |||
#define D10 $xr26 | |||
#define D11 $xr27 | |||
#define D12 $xr28 | |||
#define D13 $xr29 | |||
#define D14 $xr30 | |||
#define D15 $xr31 | |||
PROLOGUE | |||
addi.d $sp, $sp, -0x90 | |||
SDARG $r23, $sp, 0x00 | |||
SDARG $r24, $sp, 0x08 | |||
SDARG $r25, $sp, 0x10 | |||
SDARG $r26, $sp, 0x18 | |||
SDARG $r27, $sp, 0x20 | |||
SDARG $r28, $sp, 0x28 | |||
SDARG $r29, $sp, 0x30 | |||
SDARG $r30, $sp, 0x38 | |||
SDARG $r31, $sp, 0x40 | |||
ST $f23, $sp, 0x48 | |||
ST $f24, $sp, 0x50 | |||
ST $f25, $sp, 0x58 | |||
ST $f26, $sp, 0x60 | |||
ST $f27, $sp, 0x68 | |||
ST $f28, $sp, 0x70 | |||
ST $f29, $sp, 0x78 | |||
ST $f30, $sp, 0x80 | |||
ST $f31, $sp, 0x88 | |||
move TD, DST | |||
move TS, SRC | |||
slli.d TL, LDA, 0x03 | |||
slli.d T0, TL, 0x01 | |||
srai.d J, N, 0x04 | |||
beq J, ZERO, .L_N8 | |||
.L_J1: /* J-- */ | |||
move S1, TS | |||
add.d S2, TS, TL | |||
srai.d I, M, 0x03 | |||
add.d S3, S2, TL | |||
addi.d J, J, -1 | |||
add.d S4, S3, TL | |||
add.d S5, S3, T0 | |||
add.d S6, S4, T0 | |||
add.d S7, S5, T0 | |||
add.d S8, S6, T0 | |||
add.d S9, S7, T0 | |||
add.d S10, S8, T0 | |||
add.d S11, S9, T0 | |||
add.d S12, S10, T0 | |||
add.d S13, S11, T0 | |||
add.d S14, S12, T0 | |||
add.d S15, S13, T0 | |||
add.d S16, S14, T0 | |||
add.d TS, S15, T0 | |||
beq I, ZERO, .L_I7 | |||
.L_I1: /* I-- */ | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
xvld U8, S9, 0x00 | |||
xvld U9, S10, 0x00 | |||
xvld U10, S11, 0x00 | |||
xvld U11, S12, 0x00 | |||
xvld U12, S13, 0x00 | |||
xvld U13, S14, 0x00 | |||
xvld U14, S15, 0x00 | |||
xvld U15, S16, 0x00 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpackev.d D2, U3, U2 | |||
xvpackod.d D3, U3, U2 | |||
xvpackev.d D4, U5, U4 | |||
xvpackod.d D5, U5, U4 | |||
xvpackev.d D6, U7, U6 | |||
xvpackod.d D7, U7, U6 | |||
xvpackev.d D8, U9, U8 | |||
xvpackod.d D9, U9, U8 | |||
xvpackev.d D10, U11, U10 | |||
xvpackod.d D11, U11, U10 | |||
xvpackev.d D12, U13, U12 | |||
xvpackod.d D13, U13, U12 | |||
xvpackev.d D14, U15, U14 | |||
xvpackod.d D15, U15, U14 | |||
xvand.v U0, D0, D0 | |||
xvpermi.q D0, D2, 0x02 // 0 | |||
xvand.v U4, D4, D4 | |||
xvpermi.q D4, D6, 0x02 // 1 | |||
xvand.v U1, D1, D1 | |||
xvpermi.q D1, D3, 0x02 // 4 | |||
xvand.v U5, D5, D5 | |||
xvpermi.q D5, D7, 0x02 // 5 | |||
xvpermi.q D2, U0, 0x31 // 8 | |||
xvpermi.q D6, U4, 0x31 // 9 | |||
xvpermi.q D3, U1, 0x31 // 12 | |||
xvpermi.q D7, U5, 0x31 // 13 | |||
xvand.v U8, D8, D8 | |||
xvpermi.q D8, D10, 0x02 // 2 | |||
xvand.v U12, D12, D12 | |||
xvpermi.q D12, D14, 0x02 // 3 | |||
xvand.v U9, D9, D9 | |||
xvpermi.q D9, D11, 0x02 // 6 | |||
xvand.v U13, D13, D13 | |||
xvpermi.q D13, D15, 0x02 // 7 | |||
xvpermi.q D10, U8, 0x31 // 10 | |||
xvpermi.q D14, U12, 0x31 // 11 | |||
xvpermi.q D11, U9, 0x31 // 14 | |||
xvpermi.q D15, U13, 0x31 // 15 | |||
xvst D0, TD, 0x00 // 0 | |||
xvst D4, TD, 0x20 // 1 | |||
xvst D8, TD, 0x40 // 2 | |||
xvst D12, TD, 0x60 // 3 | |||
xvst D1, TD, 0x80 // 4 | |||
xvst D5, TD, 0xA0 // 5 | |||
xvst D9, TD, 0xC0 // 6 | |||
xvst D13, TD, 0xE0 // 7 | |||
addi.d TD, TD, 0x100 | |||
xvst D2, TD, 0x00 // 8 | |||
xvst D6, TD, 0x20 // 9 | |||
xvst D10, TD, 0x40 // 10 | |||
xvst D14, TD, 0x60 // 11 | |||
xvst D3, TD, 0x80 // 12 | |||
xvst D7, TD, 0xA0 // 13 | |||
xvst D11, TD, 0xC0 // 14 | |||
xvst D15, TD, 0xE0 // 15 | |||
addi.d TD, TD, 0x100 | |||
xvld U0, S1, 0x20 | |||
xvld U1, S2, 0x20 | |||
xvld U2, S3, 0x20 | |||
xvld U3, S4, 0x20 | |||
xvld U4, S5, 0x20 | |||
xvld U5, S6, 0x20 | |||
xvld U6, S7, 0x20 | |||
xvld U7, S8, 0x20 | |||
xvld U8, S9, 0x20 | |||
xvld U9, S10, 0x20 | |||
xvld U10, S11, 0x20 | |||
xvld U11, S12, 0x20 | |||
xvld U12, S13, 0x20 | |||
xvld U13, S14, 0x20 | |||
xvld U14, S15, 0x20 | |||
xvld U15, S16, 0x20 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpackev.d D2, U3, U2 | |||
xvpackod.d D3, U3, U2 | |||
xvpackev.d D4, U5, U4 | |||
xvpackod.d D5, U5, U4 | |||
xvpackev.d D6, U7, U6 | |||
xvpackod.d D7, U7, U6 | |||
xvpackev.d D8, U9, U8 | |||
xvpackod.d D9, U9, U8 | |||
xvpackev.d D10, U11, U10 | |||
xvpackod.d D11, U11, U10 | |||
xvpackev.d D12, U13, U12 | |||
xvpackod.d D13, U13, U12 | |||
xvpackev.d D14, U15, U14 | |||
xvpackod.d D15, U15, U14 | |||
xvand.v U0, D0, D0 | |||
xvpermi.q D0, D2, 0x02 // 0 | |||
xvand.v U4, D4, D4 | |||
xvpermi.q D4, D6, 0x02 // 1 | |||
xvand.v U1, D1, D1 | |||
xvpermi.q D1, D3, 0x02 // 4 | |||
xvand.v U5, D5, D5 | |||
xvpermi.q D5, D7, 0x02 // 5 | |||
xvpermi.q D2, U0, 0x31 // 8 | |||
xvpermi.q D6, U4, 0x31 // 9 | |||
xvpermi.q D3, U1, 0x31 // 12 | |||
xvpermi.q D7, U5, 0x31 // 13 | |||
xvand.v U8, D8, D8 | |||
xvpermi.q D8, D10, 0x02 // 2 | |||
xvand.v U12, D12, D12 | |||
xvpermi.q D12, D14, 0x02 // 3 | |||
xvand.v U9, D9, D9 | |||
xvpermi.q D9, D11, 0x02 // 6 | |||
xvand.v U13, D13, D13 | |||
xvpermi.q D13, D15, 0x02 // 7 | |||
xvpermi.q D10, U8, 0x31 // 10 | |||
xvpermi.q D14, U12, 0x31 // 11 | |||
xvpermi.q D11, U9, 0x31 // 14 | |||
xvpermi.q D15, U13, 0x31 // 15 | |||
xvst D0, TD, 0x00 // 0 | |||
xvst D4, TD, 0x20 // 1 | |||
xvst D8, TD, 0x40 // 2 | |||
xvst D12, TD, 0x60 // 3 | |||
xvst D1, TD, 0x80 // 4 | |||
xvst D5, TD, 0xA0 // 5 | |||
xvst D9, TD, 0xC0 // 6 | |||
xvst D13, TD, 0xE0 // 7 | |||
addi.d TD, TD, 0x100 | |||
xvst D2, TD, 0x00 // 8 | |||
xvst D6, TD, 0x20 // 9 | |||
xvst D10, TD, 0x40 // 10 | |||
xvst D14, TD, 0x60 // 11 | |||
xvst D3, TD, 0x80 // 12 | |||
xvst D7, TD, 0xA0 // 13 | |||
xvst D11, TD, 0xC0 // 14 | |||
xvst D15, TD, 0xE0 // 15 | |||
addi.d TD, TD, 0x100 | |||
addi.d S1, S1, 0x40 | |||
addi.d S2, S2, 0x40 | |||
addi.d S3, S3, 0x40 | |||
addi.d S4, S4, 0x40 | |||
addi.d S5, S5, 0x40 | |||
addi.d S6, S6, 0x40 | |||
addi.d S7, S7, 0x40 | |||
addi.d S8, S8, 0x40 | |||
addi.d S9, S9, 0x40 | |||
addi.d S10, S10, 0x40 | |||
addi.d S11, S11, 0x40 | |||
addi.d S12, S12, 0x40 | |||
addi.d S13, S13, 0x40 | |||
addi.d S14, S14, 0x40 | |||
addi.d S15, S15, 0x40 | |||
addi.d S16, S16, 0x40 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_I1 | |||
.L_I7: | |||
andi I, M, 0x07 | |||
beq I, ZERO, .L_I0 | |||
.L_II1: /* I-- */ | |||
fld.d F0, S1, 0x00 | |||
fld.d F1, S2, 0x00 | |||
fld.d F2, S3, 0x00 | |||
fld.d F3, S4, 0x00 | |||
fld.d F4, S5, 0x00 | |||
fld.d F5, S6, 0x00 | |||
fld.d F6, S7, 0x00 | |||
fld.d F7, S8, 0x00 | |||
fst.d F0, TD, 0x00 | |||
addi.d S1, S1, 0x08 | |||
fst.d F1, TD, 0x08 | |||
addi.d S2, S2, 0x08 | |||
fst.d F2, TD, 0x10 | |||
addi.d S3, S3, 0x08 | |||
fst.d F3, TD, 0x18 | |||
addi.d S4, S4, 0x08 | |||
fst.d F4, TD, 0x20 | |||
addi.d S5, S5, 0x08 | |||
fst.d F5, TD, 0x28 | |||
addi.d S6, S6, 0x08 | |||
fst.d F6, TD, 0x30 | |||
addi.d S7, S7, 0x08 | |||
fst.d F7, TD, 0x38 | |||
addi.d S8, S8, 0x08 | |||
addi.d TD, TD, 0x40 | |||
fld.d F0, S9, 0x00 | |||
fld.d F1, S10, 0x00 | |||
fld.d F2, S11, 0x00 | |||
fld.d F3, S12, 0x00 | |||
fld.d F4, S13, 0x00 | |||
fld.d F5, S14, 0x00 | |||
fld.d F6, S15, 0x00 | |||
fld.d F7, S16, 0x00 | |||
fst.d F0, TD, 0x00 | |||
addi.d S9, S9, 0x08 | |||
fst.d F1, TD, 0x08 | |||
addi.d S10, S10, 0x08 | |||
fst.d F2, TD, 0x10 | |||
addi.d S11, S11, 0x08 | |||
fst.d F3, TD, 0x18 | |||
addi.d S12, S12, 0x08 | |||
fst.d F4, TD, 0x20 | |||
addi.d S13, S13, 0x08 | |||
fst.d F5, TD, 0x28 | |||
addi.d S14, S14, 0x08 | |||
fst.d F6, TD, 0x30 | |||
addi.d S15, S15, 0x08 | |||
fst.d F7, TD, 0x38 | |||
addi.d S16, S16, 0x08 | |||
addi.d TD, TD, 0x40 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_II1 | |||
.L_I0: | |||
blt ZERO, J, .L_J1 | |||
.L_N8: | |||
andi J, N, 0x08 | |||
beq ZERO, J, .L_N4 | |||
move S1, TS | |||
add.d S2, TS, TL | |||
srai.d I, M, 0x03 | |||
add.d S3, S2, TL | |||
add.d S4, S2, T0 | |||
add.d S5, S3, T0 | |||
add.d S6, S4, T0 | |||
add.d S7, S5, T0 | |||
add.d S8, S6, T0 | |||
add.d TS, S7, T0 | |||
beq I, ZERO, .L_8I3 | |||
.L_8I1: /* I-- */ | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvld U4, S5, 0x00 | |||
xvld U5, S6, 0x00 | |||
xvld U6, S7, 0x00 | |||
xvld U7, S8, 0x00 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpackev.d D2, U3, U2 | |||
xvpackod.d D3, U3, U2 | |||
xvpackev.d D4, U5, U4 | |||
xvpackod.d D5, U5, U4 | |||
xvpackev.d D6, U7, U6 | |||
xvpackod.d D7, U7, U6 | |||
xvand.v U0, D0, D0 | |||
xvpermi.q D0, D2, 0x02 // 0 | |||
xvand.v U4, D4, D4 | |||
xvpermi.q D4, D6, 0x02 // 1 | |||
xvand.v U1, D1, D1 | |||
xvpermi.q D1, D3, 0x02 // 2 | |||
xvand.v U5, D5, D5 | |||
xvpermi.q D5, D7, 0x02 // 3 | |||
xvpermi.q D2, U0, 0x31 // 4 | |||
xvpermi.q D6, U4, 0x31 // 5 | |||
xvpermi.q D3, U1, 0x31 // 6 | |||
xvpermi.q D7, U5, 0x31 // 7 | |||
xvst D0, TD, 0x00 | |||
xvst D4, TD, 0x20 | |||
xvst D1, TD, 0x40 | |||
xvst D5, TD, 0x60 | |||
xvst D2, TD, 0x80 | |||
xvst D6, TD, 0xA0 | |||
xvst D3, TD, 0xC0 | |||
xvst D7, TD, 0xE0 | |||
addi.d TD, TD, 0x100 | |||
xvld U0, S1, 0x20 | |||
xvld U1, S2, 0x20 | |||
xvld U2, S3, 0x20 | |||
xvld U3, S4, 0x20 | |||
xvld U4, S5, 0x20 | |||
xvld U5, S6, 0x20 | |||
xvld U6, S7, 0x20 | |||
xvld U7, S8, 0x20 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpackev.d D2, U3, U2 | |||
xvpackod.d D3, U3, U2 | |||
xvpackev.d D4, U5, U4 | |||
xvpackod.d D5, U5, U4 | |||
xvpackev.d D6, U7, U6 | |||
xvpackod.d D7, U7, U6 | |||
xvand.v U0, D0, D0 | |||
xvpermi.q D0, D2, 0x02 // 0 | |||
xvand.v U4, D4, D4 | |||
xvpermi.q D4, D6, 0x02 // 1 | |||
xvand.v U1, D1, D1 | |||
xvpermi.q D1, D3, 0x02 // 2 | |||
xvand.v U5, D5, D5 | |||
xvpermi.q D5, D7, 0x02 // 3 | |||
xvpermi.q D2, U0, 0x31 // 4 | |||
xvpermi.q D6, U4, 0x31 // 5 | |||
xvpermi.q D3, U1, 0x31 // 6 | |||
xvpermi.q D7, U5, 0x31 // 7 | |||
xvst D0, TD, 0x00 | |||
xvst D4, TD, 0x20 | |||
xvst D1, TD, 0x40 | |||
xvst D5, TD, 0x60 | |||
xvst D2, TD, 0x80 | |||
xvst D6, TD, 0xA0 | |||
xvst D3, TD, 0xC0 | |||
xvst D7, TD, 0xE0 | |||
addi.d TD, TD, 0x100 | |||
addi.d S1, S1, 0x40 | |||
addi.d S2, S2, 0x40 | |||
addi.d S3, S3, 0x40 | |||
addi.d S4, S4, 0x40 | |||
addi.d S5, S5, 0x40 | |||
addi.d S6, S6, 0x40 | |||
addi.d S7, S7, 0x40 | |||
addi.d S8, S8, 0x40 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_8I1 | |||
.L_8I3: | |||
andi I, M, 0x07 | |||
beq I, ZERO, .L_N4 | |||
.L_8I11: | |||
fld.d F0, S1, 0x00 | |||
fld.d F1, S2, 0x00 | |||
fld.d F2, S3, 0x00 | |||
fld.d F3, S4, 0x00 | |||
fld.d F4, S5, 0x00 | |||
fld.d F5, S6, 0x00 | |||
fld.d F6, S7, 0x00 | |||
fld.d F7, S8, 0x00 | |||
fst.d F0, TD, 0x00 | |||
addi.d S1, S1, 0x08 | |||
fst.d F1, TD, 0x08 | |||
addi.d S2, S2, 0x08 | |||
fst.d F2, TD, 0x10 | |||
addi.d S3, S3, 0x08 | |||
fst.d F3, TD, 0x18 | |||
addi.d S4, S4, 0x08 | |||
fst.d F4, TD, 0x20 | |||
addi.d S5, S5, 0x08 | |||
fst.d F5, TD, 0x28 | |||
addi.d S6, S6, 0x08 | |||
fst.d F6, TD, 0x30 | |||
addi.d S7, S7, 0x08 | |||
fst.d F7, TD, 0x38 | |||
addi.d S8, S8, 0x08 | |||
addi.d TD, TD, 0x40 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_8I11 | |||
.L_N4: | |||
andi J, N, 0x04 | |||
beq ZERO, J, .L_N2 | |||
move S1, TS | |||
add.d S2, TS, TL | |||
srai.d I, M, 0x02 | |||
add.d S3, S2, TL | |||
add.d S4, S2, T0 | |||
add.d TS, S3, T0 | |||
beq I, ZERO, .L_I3 | |||
.L_4I1: /* I-- */ | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpackev.d D2, U3, U2 | |||
xvpackod.d D3, U3, U2 | |||
xvand.v U0, D0, D0 | |||
xvpermi.q D0, D2, 0x02 // 0 | |||
xvand.v U1, D1, D1 | |||
xvpermi.q D1, D3, 0x02 // 1 | |||
xvpermi.q D2, U0, 0x31 // 2 | |||
xvpermi.q D3, U1, 0x31 // 3 | |||
xvst D0, TD, 0x00 | |||
xvst D1, TD, 0x20 | |||
xvst D2, TD, 0x40 | |||
xvst D3, TD, 0x60 | |||
addi.d S1, S1, 0x20 | |||
addi.d S2, S2, 0x20 | |||
addi.d S3, S3, 0x20 | |||
addi.d S4, S4, 0x20 | |||
addi.d TD, TD, 0x80 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_4I1 | |||
.L_I3: | |||
andi I, M, 0x03 | |||
beq I, ZERO, .L_N2 | |||
.L_4II1: | |||
fld.d F0, S1, 0x00 | |||
fld.d F1, S2, 0x00 | |||
fld.d F2, S3, 0x00 | |||
fld.d F3, S4, 0x00 | |||
fst.d F0, TD, 0x00 | |||
addi.d S1, S1, 0x08 | |||
fst.d F1, TD, 0x08 | |||
addi.d S2, S2, 0x08 | |||
fst.d F2, TD, 0x10 | |||
addi.d S3, S3, 0x08 | |||
fst.d F3, TD, 0x18 | |||
addi.d S4, S4, 0x08 | |||
addi.d TD, TD, 0x20 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_4II1 | |||
.L_N2: | |||
andi J, N, 0x02 | |||
beq ZERO, J, .L_N1 | |||
move S1, TS | |||
add.d S2, TS, TL | |||
srai.d I, M, 0x01 | |||
add.d TS, S2, TL | |||
beq I, ZERO, .L_NI1 | |||
.L_2I1: /* I-- */ | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvpackev.d D0, U1, U0 | |||
xvpackod.d D1, U1, U0 | |||
xvpermi.q D0, D1, 0x02 // 0 | |||
xvst D0, TD, 0x00 | |||
addi.d S1, S1, 0x10 | |||
addi.d S2, S2, 0x10 | |||
addi.d TD, TD, 0x20 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_2I1 | |||
.L_NI1: | |||
andi I, M, 0x01 | |||
beq I, ZERO, .L_N1 | |||
fld.d F0, S1, 0x00 | |||
fld.d F1, S2, 0x00 | |||
fst.d F0, TD, 0x00 | |||
addi.d S1, S1, 0x08 | |||
fst.d F1, TD, 0x08 | |||
addi.d S2, S2, 0x08 | |||
addi.d TD, TD, 0x10 | |||
.L_N1: | |||
move S1, TS | |||
beq ZERO, M, .L_N0 | |||
.L_M1: | |||
fld.d F0, S1, 0x00 | |||
addi.d S1, S1, 0x08 | |||
fst.d F0, TD, 0x00 | |||
addi.d TD, TD, 0x08 | |||
addi.d M, M, -1 | |||
blt ZERO, M, .L_M1 | |||
.L_N0: | |||
LDARG $r23, $sp, 0x00 | |||
LDARG $r24, $sp, 0x08 | |||
LDARG $r25, $sp, 0x10 | |||
LDARG $r26, $sp, 0x18 | |||
LDARG $r27, $sp, 0x20 | |||
LDARG $r28, $sp, 0x28 | |||
LDARG $r29, $sp, 0x30 | |||
LDARG $r30, $sp, 0x38 | |||
LDARG $r31, $sp, 0x40 | |||
LD $f23, $sp, 0x48 | |||
LD $f24, $sp, 0x50 | |||
LD $f25, $sp, 0x58 | |||
LD $f26, $sp, 0x60 | |||
LD $f27, $sp, 0x68 | |||
LD $f28, $sp, 0x70 | |||
LD $f29, $sp, 0x78 | |||
LD $f30, $sp, 0x80 | |||
LD $f31, $sp, 0x88 | |||
addi.d $sp, $sp, 0x90 | |||
jirl $r0, $r1, 0x00 | |||
EPILOGUE |
@@ -0,0 +1,325 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Function parameters */ | |||
// Symbolic names for the registers used by the routine that follows.
// The five parameter registers carry the original author's labels.
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: the routine derives J from N and I from M (shifted or masked).
#define I $r9 | |||
#define J $r10 | |||
// Source read pointers. The routine only references S1-S4, one per source
// line of the current group; S5-S7 are defined but not referenced by it.
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
// TD: running destination pointer, initialised from DST.
// TS: source pointer for the start of the next group, initialised from SRC.
// TL: LDA shifted left by 3 in total, used as the byte distance between
//     adjacent source lines.
// T0: scratch holding a multiple of TL; it lives in $r23, which the routine
//     saves on the stack on entry and restores before returning.
#define TD $r20 | |||
#define TS $r11 | |||
#define TL $r19 | |||
#define T0 $r23 | |||
// $r0 is used as the zero operand in the compare-and-branch instructions.
#define ZERO $r0 | |||
// Scalar FP temporaries used by the single-element tails (fld.s / fst.s).
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
/* LASX vectors */ | |||
// U0-U3 receive the loaded source vectors and U4/U5 hold shuffled results;
// U6 and U7 are defined but not referenced by the routine.
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
// D0-D3 are shuffle work registers; D4-D8 are defined but not referenced
// by the routine.
#define D0 $xr8 | |||
#define D1 $xr9 | |||
#define D2 $xr10 | |||
#define D3 $xr11 | |||
#define D4 $xr12 | |||
#define D5 $xr13 | |||
#define D6 $xr14 | |||
#define D7 $xr15 | |||
#define D8 $xr16 | |||
/*
 * Packs a strided source block into the contiguous buffer at DST.
 *
 * Inputs are the registers named M, N, SRC, LDA and DST. Elements are
 * handled as 8-byte units: the line stride TL is LDA << 3, and the scalar
 * tails move two 32-bit floats per element.
 *
 * Source lines (TL bytes apart) are taken in groups of 4 (N >> 2 times),
 * then an optional group of 2 (N & 2) and an optional single line (N & 1).
 * Within a group, elements along the lines are taken 4 at a time (M >> 2
 * times), then 2 (M & 2), then 1 (M & 1). For each element position the
 * entries from every line of the group are written back-to-back, so the
 * output interleaves the lines of a group.
 *
 * NOTE(review): this looks like the complex single-precision GEMM "ncopy"
 * packing kernel with unroll 4 - confirm against the kernel list of the build.
 */
PROLOGUE | |||
// Reserve 8 bytes and save $r23, which is used as T0 below.
addi.d $sp, $sp, -8 | |||
SDARG $r23, $sp, 0 | |||
move TD, DST //boffset | |||
move TS, SRC //aoffset | |||
// TL = LDA << 3 (done as two shifts): byte distance between source lines.
slli.d TL, LDA, 0x02 | |||
slli.d TL, TL, 0x01 | |||
// J = number of full 4-line groups; skip straight to the tails if none.
srai.d J, N, 0x02 | |||
beq J, ZERO, .L_N0 | |||
.L_J1: /* J-- */ | |||
// S1..S4 point at the four lines of this group; TS advances by 4 * TL.
move S1, TS | |||
add.d S2, S1, TL | |||
add.d S3, S2, TL | |||
add.d S4, S3, TL | |||
slli.d T0, TL, 0x02 | |||
add.d TS, TS, T0 | |||
// I = number of 4-element steps along the lines.
srai.d I, M, 0x02 | |||
beq I, ZERO, .L_I3 | |||
.L_I1: /* I-- */ | |||
// Load 32 bytes (4 elements) from each line. The trailing lane comments
// number the 32-bit words of the four loads 1..32.
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 | |||
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 | |||
xvld U2, S3, 0x00 //17 18 19 20 21 22 23 24 | |||
xvld U3, S4, 0x00 //25 26 27 28 29 30 31 32 | |||
// xvand.v of a register with itself is used as a register copy.
xvand.v D0, U0, U0 | |||
xvand.v D1, U1, U1 | |||
xvand.v D2, U2, U2 | |||
xvand.v D3, U3, U3 | |||
// Pair up matching 64-bit elements of two lines within each 128-bit half.
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 | |||
xvshuf4i.d D2, U3, 0x88 //17 18 25 26 21 22 29 30 | |||
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 | |||
xvshuf4i.d D3, U2, 0x77 //19 20 27 28 23 24 31 32 | |||
xvand.v U4, D0, D0 | |||
xvand.v U5, D1, D1 | |||
// Combine 128-bit halves so each result holds one element from all 4 lines.
xvpermi.q U4, D2, 0x02 //1 2 9 10 17 18 25 26 | |||
xvpermi.q U5, D3, 0x02 //3 4 11 12 19 20 27 28 | |||
xvpermi.q D2, D0, 0x31 //5 6 13 14 21 22 29 30 | |||
xvpermi.q D3, D1, 0x31 //7 8 15 16 23 24 31 32 | |||
// Store the four interleaved groups in element order (128 bytes total).
xvst U4, TD, 0x00 | |||
xvst U5, TD, 0x20 | |||
xvst D2, TD, 0x40 | |||
xvst D3, TD, 0x60 | |||
addi.d S1, S1, 0x20 // a_offset | |||
addi.d S2, S2, 0x20 | |||
addi.d S3, S3, 0x20 | |||
addi.d S4, S4, 0x20 | |||
addi.d TD, TD, 0x80 // b_offset | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_I1 | |||
.L_I3: /* if(m&2) */ | |||
andi I, M, 0x02 | |||
beq I, ZERO, .L_II20 | |||
// Two remaining elements per line: 16-byte loads from each of the 4 lines.
vld $vr0, S1, 0x00 | |||
vld $vr1, S2, 0x00 | |||
vld $vr2, S3, 0x00 | |||
vld $vr3, S4, 0x00 | |||
// Register copies of lines 2 and 4; the permutes below merge in lines 1 and 3.
vand.v $vr8, $vr1, $vr1 | |||
vand.v $vr9, $vr1, $vr1 | |||
vand.v $vr10, $vr3, $vr3 | |||
vand.v $vr11, $vr3, $vr3 | |||
// 0x44 pairs the first element of each line, 0xee pairs the second.
vpermi.w $vr8, $vr0, 0x44 | |||
vpermi.w $vr10, $vr2, 0x44 | |||
vpermi.w $vr9, $vr0, 0xee | |||
vpermi.w $vr11, $vr2, 0xee | |||
// First elements of lines 1-4, then second elements of lines 1-4.
vst $vr8, TD, 0x00 | |||
vst $vr10, TD, 0x10 | |||
vst $vr9, TD, 0x20 | |||
vst $vr11, TD, 0x30 | |||
addi.d S1, S1, 0x10 | |||
addi.d S2, S2, 0x10 | |||
addi.d S3, S3, 0x10 | |||
addi.d S4, S4, 0x10 | |||
addi.d TD, TD, 0x40 | |||
.L_II20: /* if(m&1) */ | |||
andi I, M, 0x01 | |||
beq I, ZERO, .L_J0 | |||
// One remaining element per line, moved as two 32-bit floats each.
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fld.s F2, S2, 0x00 | |||
fld.s F3, S2, 0x04 | |||
fld.s F4, S3, 0x00 | |||
fld.s F5, S3, 0x04 | |||
fld.s F6, S4, 0x00 | |||
fld.s F7, S4, 0x04 | |||
fst.s F0, TD, 0x00 | |||
fst.s F1, TD, 0x04 | |||
fst.s F2, TD, 0x08 | |||
fst.s F3, TD, 0x0c | |||
fst.s F4, TD, 0x10 | |||
fst.s F5, TD, 0x14 | |||
fst.s F6, TD, 0x18 | |||
fst.s F7, TD, 0x1c | |||
addi.d TD, TD, 0x20 | |||
.L_J0: | |||
// Next group of four lines.
addi.d J, J, -1 | |||
blt ZERO, J, .L_J1 | |||
.L_N0: /* if(n&2) */ | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_N20 | |||
// Group of two lines; TS advances by 2 * TL.
move S1, TS | |||
add.d S2, S1, TL | |||
slli.d T0, TL, 0x01 | |||
add.d TS, TS, T0 | |||
srai.d I, M, 0x02 | |||
beq ZERO, I, .L_N10 | |||
.L_N11: /* if(i>0) */ | |||
// Same interleave as the 4-line loop, with only two source lines.
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 | |||
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 | |||
xvand.v D0, U0, U0 | |||
xvand.v D1, U1, U1 | |||
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 | |||
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 | |||
xvand.v U4, D0, D0 | |||
xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12 | |||
xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16 | |||
xvst U4, TD, 0x00 | |||
xvst D1, TD, 0x20 | |||
addi.d S1, S1, 0x20 // a_offset | |||
addi.d S2, S2, 0x20 | |||
addi.d TD, TD, 0x40 // b_offset | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_N11 | |||
.L_N10: /* if(m&2) */ | |||
andi I, M, 0x02 | |||
beq I, ZERO, .L_N130 | |||
// Two remaining elements from each of the two lines.
vld $vr0, S1, 0x00 | |||
vld $vr1, S2, 0x00 | |||
vand.v $vr8, $vr1, $vr1 | |||
vpermi.w $vr8, $vr0, 0x44 | |||
vpermi.w $vr1, $vr0, 0xee | |||
vst $vr8, TD, 0x00 | |||
vst $vr1, TD, 0x10 | |||
addi.d S1, S1, 0x10 // a_offset | |||
addi.d S2, S2, 0x10 | |||
addi.d TD, TD, 0x20 // b_offset | |||
.L_N130: /* if(m&1) */ | |||
andi I, M, 0x01 | |||
beq I, ZERO, .L_N20 | |||
// One remaining element from each of the two lines.
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fld.s F2, S2, 0x00 | |||
fld.s F3, S2, 0x04 | |||
fst.s F0, TD, 0x00 | |||
fst.s F1, TD, 0x04 | |||
fst.s F2, TD, 0x08 | |||
fst.s F3, TD, 0x0c | |||
addi.d TD, TD, 0x10 | |||
.L_N20: /* if(n&1) */ | |||
andi I, N, 0x01 | |||
beq I, ZERO, .L_N00 | |||
// Single remaining line: a straight copy, 4 elements at a time.
move S1, TS | |||
srai.d I, M, 0x02 | |||
beq I, ZERO, .L_N30 | |||
.L_N21: /* if(i>0) */ | |||
xvld U0, S1, 0x00 | |||
xvst U0, TD, 0x00 | |||
addi.d S1, S1, 0x20 // aoffset1 | |||
addi.d TD, TD, 0x20 // b_offset | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_N21 | |||
.L_N30: /* if(m&2) */ | |||
andi I, M, 0x02 | |||
beq I, ZERO, .L_N330 | |||
vld $vr0, S1, 0x00 | |||
vst $vr0, TD, 0x00 | |||
addi.d S1, S1, 0x10 // aoffset1 | |||
addi.d TD, TD, 0x10 // b_offset | |||
.L_N330: /* if(m&1) */ | |||
andi I, M, 0x01 | |||
beq I, ZERO, .L_N00 | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fst.s F0, TD, 0x00 | |||
fst.s F1, TD, 0x04 | |||
.L_N00: | |||
// Restore $r23, release the stack slot and return through $r1.
LDARG $r23, $sp, 0 | |||
addi.d $sp, $sp, 8 | |||
jirl $r0, $r1, 0x00 | |||
EPILOGUE |
@@ -0,0 +1,741 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Function parameters */ | |||
// Symbolic names for the registers used by the routine that follows.
// The five parameter registers carry the original author's labels.
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: the routine derives J from M (row groups) and I from N.
#define I $r9 | |||
#define J $r10 | |||
// S0: source pointer for the start of the next row group (starts at SRC).
// S1-S8: per-row read pointers within the current group.
#define S0 $r11 | |||
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
#define S5 $r16 | |||
#define S6 $r17 | |||
#define S7 $r18 | |||
#define S8 $r19 | |||
// P0: destination pointer for the 16-wide region (starts at DST);
// P1: working copy of P0 inside the 16-wide loop;
// P2-P5: destination pointers for the 8-, 4-, 2- and 1-wide tail regions.
// P1-P5 live in $r23-$r27, which the routine saves and restores.
#define P0 $r20 | |||
#define P1 $r23 | |||
#define P2 $r24 | |||
#define P3 $r25 | |||
#define P4 $r26 | |||
#define P5 $r27 | |||
// T0/T1: scratch while the region offsets are computed; afterwards T0 holds
// 2 * TL and T1 holds M << 7. Both ($r28, $r29) are saved and restored.
#define T0 $r28 | |||
#define T1 $r29 | |||
// TL shares $r7 with LDA: the routine overwrites it in place with LDA << 3.
#define TL $r7 | |||
// $r0 is used as the zero operand in the compare-and-branch instructions.
#define ZERO $r0 | |||
// Scalar FP temporaries used by the narrow tails.
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
#define F8 $f8 | |||
#define F9 $f9 | |||
#define F10 $f10 | |||
#define F11 $f11 | |||
#define F12 $f12 | |||
#define F13 $f13 | |||
#define F14 $f14 | |||
#define F15 $f15 | |||
/* LASX vectors */ | |||
// 256-bit registers used as load/store temporaries for the bulk copies.
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
/*
 * Packs a strided source block into the contiguous buffer at DST, copying
 * rows in width classes of 16, 8, 4, 2 and 1 elements.
 *
 * Inputs are the registers named M, N, SRC, LDA and DST. Elements are
 * handled as 8-byte units (the row stride TL is LDA << 3).
 *
 * Destination layout (byte offsets from DST):
 *   P0 = 0                     16-wide panels
 *   P2 = M * (N & ~15) * 8     8-wide tail  (N & 8)
 *   P3 = M * (N & ~7)  * 8     4-wide tail  (N & 4)
 *   P4 = M * (N & ~3)  * 8     2-wide tail  (N & 2)
 *   P5 = M * (N & ~1)  * 8     1-wide tail  (N & 1)
 *
 * Rows (TL bytes apart) are processed in groups of 8 (M >> 3 times), then
 * 4 (M & 4), 2 (M & 2) and 1 (M & 1). Within the 16-wide region, successive
 * panels of the same rows are M << 7 bytes apart (T1).
 *
 * The 2-wide tails use 128-bit vld/vst so that exactly 16 bytes per row are
 * read; a 256-bit load there would read 16 bytes past the last element the
 * routine needs, which for the final row may lie outside the source buffer.
 *
 * NOTE(review): this looks like the complex single-precision GEMM "tcopy"
 * packing kernel with unroll 16 - confirm against the kernel list of the build.
 */
PROLOGUE
    // Save the callee-saved registers used as P1-P5, T0 and T1.
    addi.d $sp, $sp, -56
    SDARG $r23, $sp, 0
    SDARG $r24, $sp, 8
    SDARG $r25, $sp, 16
    SDARG $r26, $sp, 24
    SDARG $r27, $sp, 32
    SDARG $r28, $sp, 40
    SDARG $r29, $sp, 48

    move S0, SRC
    move P0, DST

    // P2 = DST + M * (N & ~15) * 8,  P3 = DST + M * (N & ~7) * 8
    srai.d T0, N, 0x04
    srai.d T1, N, 0x03
    slli.d T0, T0, 0x04
    slli.d T1, T1, 0x03
    mul.d P2, M, T0
    mul.d P3, M, T1
    slli.d P2, P2, 0x03
    slli.d P3, P3, 0x03
    add.d P2, DST, P2
    add.d P3, DST, P3

    // P4 = DST + M * (N & ~3) * 8,  P5 = DST + M * (N & ~1) * 8
    srai.d T0, N, 0x02
    srai.d T1, N, 0x01
    slli.d T0, T0, 0x02
    slli.d T1, T1, 0x01
    mul.d P4, M, T0
    mul.d P5, M, T1
    slli.d P4, P4, 0x03
    slli.d P5, P5, 0x03
    add.d P4, DST, P4
    add.d P5, DST, P5

    // TL = row stride in bytes (overwrites LDA, same register),
    // T0 = 2 * TL, T1 = M * 128 = distance between 16-wide panels.
    slli.d TL, LDA, 0x03
    srai.d J, M, 0x03
    slli.d T0, TL, 0x01
    slli.d T1, M, 0x07
    beq ZERO, J, .L_M7

.L_J1: /* J-- : group of 8 rows */
    move S1, S0
    add.d S2, S0, TL
    add.d S3, S1, T0
    add.d S4, S2, T0
    add.d S5, S3, T0
    add.d S6, S4, T0
    add.d S7, S5, T0
    add.d S8, S6, T0
    add.d S0, S7, T0          // S0 -> first row of the next group
    move P1, P0
    addi.d P0, P0, 0x400      // 8 rows * 128 bytes
    srai.d I, N, 0x04
    addi.d J, J, -1
    beq ZERO, I, .L_N15

.L_I1: /* I-- : 16 elements (128 bytes) from each of 8 rows */
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S1, 0x40
    xvld U3, S1, 0x60
    xvld U4, S2, 0x00
    xvld U5, S2, 0x20
    xvld U6, S2, 0x40
    xvld U7, S2, 0x60
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60
    xvst U4, P1, 0x80
    xvst U5, P1, 0xA0
    xvst U6, P1, 0xC0
    xvst U7, P1, 0xE0
    xvld U0, S3, 0x00
    xvld U1, S3, 0x20
    xvld U2, S3, 0x40
    xvld U3, S3, 0x60
    xvld U4, S4, 0x00
    xvld U5, S4, 0x20
    xvld U6, S4, 0x40
    xvld U7, S4, 0x60
    xvst U0, P1, 0x100
    xvst U1, P1, 0x120
    xvst U2, P1, 0x140
    xvst U3, P1, 0x160
    xvst U4, P1, 0x180
    xvst U5, P1, 0x1A0
    xvst U6, P1, 0x1C0
    xvst U7, P1, 0x1E0
    xvld U0, S5, 0x00
    xvld U1, S5, 0x20
    xvld U2, S5, 0x40
    xvld U3, S5, 0x60
    xvld U4, S6, 0x00
    xvld U5, S6, 0x20
    xvld U6, S6, 0x40
    xvld U7, S6, 0x60
    xvst U0, P1, 0x200
    xvst U1, P1, 0x220
    xvst U2, P1, 0x240
    xvst U3, P1, 0x260
    xvst U4, P1, 0x280
    xvst U5, P1, 0x2A0
    xvst U6, P1, 0x2C0
    xvst U7, P1, 0x2E0
    xvld U0, S7, 0x00
    xvld U1, S7, 0x20
    xvld U2, S7, 0x40
    xvld U3, S7, 0x60
    xvld U4, S8, 0x00
    xvld U5, S8, 0x20
    xvld U6, S8, 0x40
    xvld U7, S8, 0x60
    xvst U0, P1, 0x300
    xvst U1, P1, 0x320
    xvst U2, P1, 0x340
    xvst U3, P1, 0x360
    xvst U4, P1, 0x380
    xvst U5, P1, 0x3A0
    xvst U6, P1, 0x3C0
    xvst U7, P1, 0x3E0
    addi.d S1, S1, 0x80
    addi.d S2, S2, 0x80
    addi.d S3, S3, 0x80
    addi.d S4, S4, 0x80
    addi.d S5, S5, 0x80
    addi.d S6, S6, 0x80
    addi.d S7, S7, 0x80
    addi.d S8, S8, 0x80
    addi.d I, I, -1
    add.d P1, P1, T1          // next 16-wide panel for these rows
    blt ZERO, I, .L_I1

.L_N15: /* if(n&8) : 8 elements (64 bytes) from each of 8 rows */
    andi I, N, 0x08
    beq ZERO, I, .L_N7
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvld U4, S3, 0x00
    xvld U5, S3, 0x20
    xvld U6, S4, 0x00
    xvld U7, S4, 0x20
    xvst U0, P2, 0x00
    xvst U1, P2, 0x20
    xvst U2, P2, 0x40
    xvst U3, P2, 0x60
    xvst U4, P2, 0x80
    xvst U5, P2, 0xA0
    xvst U6, P2, 0xC0
    xvst U7, P2, 0xE0
    xvld U0, S5, 0x00
    xvld U1, S5, 0x20
    xvld U2, S6, 0x00
    xvld U3, S6, 0x20
    xvld U4, S7, 0x00
    xvld U5, S7, 0x20
    xvld U6, S8, 0x00
    xvld U7, S8, 0x20
    xvst U0, P2, 0x100
    xvst U1, P2, 0x120
    xvst U2, P2, 0x140
    xvst U3, P2, 0x160
    xvst U4, P2, 0x180
    xvst U5, P2, 0x1A0
    xvst U6, P2, 0x1C0
    xvst U7, P2, 0x1E0
    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d S3, S3, 0x40
    addi.d S4, S4, 0x40
    addi.d S5, S5, 0x40
    addi.d S6, S6, 0x40
    addi.d S7, S7, 0x40
    addi.d S8, S8, 0x40
    addi.d P2, P2, 0x200

.L_N7: /* if(n&4) : 4 elements (32 bytes) from each of 8 rows */
    andi I, N, 0x04
    beq ZERO, I, .L_N3
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    xvld U4, S5, 0x00
    xvld U5, S6, 0x00
    xvld U6, S7, 0x00
    xvld U7, S8, 0x00
    xvst U0, P3, 0x00
    xvst U1, P3, 0x20
    xvst U2, P3, 0x40
    xvst U3, P3, 0x60
    xvst U4, P3, 0x80
    xvst U5, P3, 0xA0
    xvst U6, P3, 0xC0
    xvst U7, P3, 0xE0
    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d S5, S5, 0x20
    addi.d S6, S6, 0x20
    addi.d S7, S7, 0x20
    addi.d S8, S8, 0x20
    addi.d P3, P3, 0x100

.L_N3: /* if(n&2) : 2 elements (16 bytes) from each of 8 rows */
    andi I, N, 0x02
    beq ZERO, I, .L_N1
    // 128-bit accesses: read exactly the 16 bytes each row still provides.
    vld $vr0, S1, 0x00
    vld $vr1, S2, 0x00
    vld $vr2, S3, 0x00
    vld $vr3, S4, 0x00
    vld $vr4, S5, 0x00
    vld $vr5, S6, 0x00
    vld $vr6, S7, 0x00
    vld $vr7, S8, 0x00
    vst $vr0, P4, 0x00
    vst $vr1, P4, 0x10
    vst $vr2, P4, 0x20
    vst $vr3, P4, 0x30
    vst $vr4, P4, 0x40
    vst $vr5, P4, 0x50
    vst $vr6, P4, 0x60
    vst $vr7, P4, 0x70
    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d S3, S3, 0x10
    addi.d S4, S4, 0x10
    addi.d S5, S5, 0x10
    addi.d S6, S6, 0x10
    addi.d S7, S7, 0x10
    addi.d S8, S8, 0x10
    addi.d P4, P4, 0x80

.L_N1: /* if(n&1) : 1 element from each of 8 rows, as two 32-bit floats */
    andi I, N, 0x01
    beq ZERO, I, .L_N0
    fld.s F0, S1, 0x00
    fld.s F1, S1, 0x04
    fld.s F2, S2, 0x00
    fld.s F3, S2, 0x04
    fld.s F4, S3, 0x00
    fld.s F5, S3, 0x04
    fld.s F6, S4, 0x00
    fld.s F7, S4, 0x04
    fld.s F8, S5, 0x00
    fld.s F9, S5, 0x04
    fld.s F10, S6, 0x00
    fld.s F11, S6, 0x04
    fld.s F12, S7, 0x00
    fld.s F13, S7, 0x04
    fld.s F14, S8, 0x00
    fld.s F15, S8, 0x04
    fst.s F0, P5, 0x00
    fst.s F1, P5, 0x04
    fst.s F2, P5, 0x08
    fst.s F3, P5, 0x0c
    fst.s F4, P5, 0x10
    fst.s F5, P5, 0x14
    fst.s F6, P5, 0x18
    fst.s F7, P5, 0x1c
    fst.s F8, P5, 0x20
    fst.s F9, P5, 0x24
    fst.s F10, P5, 0x28
    fst.s F11, P5, 0x2c
    fst.s F12, P5, 0x30
    fst.s F13, P5, 0x34
    fst.s F14, P5, 0x38
    fst.s F15, P5, 0x3c
    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d S3, S3, 0x08
    addi.d S4, S4, 0x08
    addi.d S5, S5, 0x08
    addi.d S6, S6, 0x08
    addi.d S7, S7, 0x08
    addi.d S8, S8, 0x08
    addi.d P5, P5, 0x40

.L_N0:
    blt ZERO, J, .L_J1

.L_M7: /* if(m&4) : group of 4 rows */
    andi J, M, 0x04
    beq ZERO, J, .L_M3
    move S1, S0
    add.d S2, S0, TL
    add.d S3, S1, T0
    add.d S4, S2, T0
    add.d S0, S3, T0
    move P1, P0
    addi.d P0, P0, 0x200      // 4 rows * 128 bytes
    srai.d I, N, 0x04
    beq ZERO, I, .L_4N15

.L_4I1: /* I-- : 16 elements from each of 4 rows */
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S1, 0x40
    xvld U3, S1, 0x60
    xvld U4, S2, 0x00
    xvld U5, S2, 0x20
    xvld U6, S2, 0x40
    xvld U7, S2, 0x60
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60
    xvst U4, P1, 0x80
    xvst U5, P1, 0xA0
    xvst U6, P1, 0xC0
    xvst U7, P1, 0xE0
    xvld U0, S3, 0x00
    xvld U1, S3, 0x20
    xvld U2, S3, 0x40
    xvld U3, S3, 0x60
    xvld U4, S4, 0x00
    xvld U5, S4, 0x20
    xvld U6, S4, 0x40
    xvld U7, S4, 0x60
    xvst U0, P1, 0x100
    xvst U1, P1, 0x120
    xvst U2, P1, 0x140
    xvst U3, P1, 0x160
    xvst U4, P1, 0x180
    xvst U5, P1, 0x1A0
    xvst U6, P1, 0x1C0
    xvst U7, P1, 0x1E0
    addi.d S1, S1, 0x80
    addi.d S2, S2, 0x80
    addi.d S3, S3, 0x80
    addi.d S4, S4, 0x80
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_4I1

.L_4N15: /* if(n&8) */
    andi I, N, 0x08
    beq ZERO, I, .L_4N7
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvld U4, S3, 0x00
    xvld U5, S3, 0x20
    xvld U6, S4, 0x00
    xvld U7, S4, 0x20
    xvst U0, P2, 0x00
    xvst U1, P2, 0x20
    xvst U2, P2, 0x40
    xvst U3, P2, 0x60
    xvst U4, P2, 0x80
    xvst U5, P2, 0xA0
    xvst U6, P2, 0xC0
    xvst U7, P2, 0xE0
    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d S3, S3, 0x40
    addi.d S4, S4, 0x40
    addi.d P2, P2, 0x100

.L_4N7: /* if(n&4) */
    andi I, N, 0x04
    beq ZERO, I, .L_4N3
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    xvst U0, P3, 0x00
    xvst U1, P3, 0x20
    xvst U2, P3, 0x40
    xvst U3, P3, 0x60
    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d P3, P3, 0x80

.L_4N3: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_4N1
    // 128-bit accesses: read exactly the 16 bytes each row still provides.
    vld $vr0, S1, 0x00
    vld $vr1, S2, 0x00
    vld $vr2, S3, 0x00
    vld $vr3, S4, 0x00
    vst $vr0, P4, 0x00
    vst $vr1, P4, 0x10
    vst $vr2, P4, 0x20
    vst $vr3, P4, 0x30
    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d S3, S3, 0x10
    addi.d S4, S4, 0x10
    addi.d P4, P4, 0x40

.L_4N1: /* if(n&1) : one 8-byte element per row */
    andi I, N, 0x01
    beq ZERO, I, .L_M3
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00
    fst.d F0, P5, 0x00
    fst.d F1, P5, 0x08
    fst.d F2, P5, 0x10
    fst.d F3, P5, 0x18
    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d S3, S3, 0x08
    addi.d S4, S4, 0x08
    addi.d P5, P5, 0x20

.L_M3: /* if(m&2) : group of 2 rows */
    andi J, M, 0x02
    beq ZERO, J, .L_M1
    move S1, S0
    add.d S2, S0, TL
    add.d S0, S0, T0
    move P1, P0
    addi.d P0, P0, 0x100      // 2 rows * 128 bytes
    srai.d I, N, 0x04
    beq ZERO, I, .L_2N15

.L_2I1: /* I-- : 16 elements from each of 2 rows */
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S1, 0x40
    xvld U3, S1, 0x60
    xvld U4, S2, 0x00
    xvld U5, S2, 0x20
    xvld U6, S2, 0x40
    xvld U7, S2, 0x60
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60
    xvst U4, P1, 0x80
    xvst U5, P1, 0xA0
    xvst U6, P1, 0xC0
    xvst U7, P1, 0xE0
    addi.d S1, S1, 0x80
    addi.d S2, S2, 0x80
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_2I1

.L_2N15: /* if(n&8) */
    andi I, N, 0x08
    beq ZERO, I, .L_2N7
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvst U0, P2, 0x00
    xvst U1, P2, 0x20
    xvst U2, P2, 0x40
    xvst U3, P2, 0x60
    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d P2, P2, 0x80

.L_2N7: /* if(n&4) */
    andi I, N, 0x04
    beq ZERO, I, .L_2N3
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvst U0, P3, 0x00
    xvst U1, P3, 0x20
    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d P3, P3, 0x40

.L_2N3: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_2N1
    // 128-bit accesses: read exactly the 16 bytes each row still provides.
    vld $vr0, S1, 0x00
    vld $vr1, S2, 0x00
    vst $vr0, P4, 0x00
    vst $vr1, P4, 0x10
    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d P4, P4, 0x20

.L_2N1: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_M1
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fst.d F0, P5, 0x00
    fst.d F1, P5, 0x08
    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d P5, P5, 0x10

.L_M1: /* if(m&1) : single remaining row */
    andi J, M, 0x01
    beq ZERO, J, .L_M0
    move S1, S0
    move P1, P0
    addi.d P0, P0, 0x80       // 1 row * 128 bytes
    srai.d I, N, 0x04
    beq ZERO, I, .L_1N15

.L_1I1: /* I-- : 16 elements from the row */
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S1, 0x40
    xvld U3, S1, 0x60
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60
    addi.d S1, S1, 0x80
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_1I1

.L_1N15: /* if(n&8) */
    andi I, N, 0x08
    beq ZERO, I, .L_1N7
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvst U0, P2, 0x00
    xvst U1, P2, 0x20
    addi.d S1, S1, 0x40
    addi.d P2, P2, 0x40

.L_1N7: /* if(n&4) */
    andi I, N, 0x04
    beq ZERO, I, .L_1N3
    xvld U0, S1, 0x00
    xvst U0, P3, 0x00
    addi.d S1, S1, 0x20
    addi.d P3, P3, 0x20

.L_1N3: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_1N1
    fld.d F0, S1, 0x00
    fld.d F1, S1, 0x08
    fst.d F0, P4, 0x00
    fst.d F1, P4, 0x08
    addi.d S1, S1, 0x10
    addi.d P4, P4, 0x10

.L_1N1: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_M0
    fld.d F0, S1, 0x00
    fst.d F0, P5, 0x00
    addi.d S1, S1, 0x08
    addi.d P5, P5, 0x08

.L_M0:
    // Restore the saved registers, release the frame and return through $r1.
    LDARG $r23, $sp, 0
    LDARG $r24, $sp, 8
    LDARG $r25, $sp, 16
    LDARG $r26, $sp, 24
    LDARG $r27, $sp, 32
    LDARG $r28, $sp, 40
    LDARG $r29, $sp, 48
    addi.d $sp, $sp, 56
    jirl $r0, $r1, 0x00
EPILOGUE
@@ -0,0 +1,306 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* Function parameters */ | |||
// Symbolic names for the registers used by the routine that follows.
// The five parameter registers carry the original author's labels.
#define M $r4 // param 1: m | |||
#define N $r5 // param 2: n | |||
#define SRC $r6 // param 3: src | |||
#define LDA $r7 // param 4: lda | |||
#define DST $r8 // param 5: dst | |||
// Loop counters: the routine sets J from M >> 2 and I from N >> 2.
#define I $r9 | |||
#define J $r10 | |||
// Per-line source read pointers, TL bytes apart.
#define S1 $r12 | |||
#define S2 $r13 | |||
#define S3 $r14 | |||
#define S4 $r15 | |||
// TD: destination pointer for the 4-wide region (starts at DST).
// TS: source pointer for the start of the next group (starts at SRC).
// TL: LDA shifted left by 3 in total, the byte distance between lines.
// T0: scratch for offset and stride computations.
#define TD $r16 | |||
#define TS $r17 | |||
#define TL $r18 | |||
#define T0 $r19 | |||
// S8: working destination pointer inside the 4-wide loop ("boffset1").
// S9: destination pointer for the N & 2 tail ("boffset2"); it lives in
//     $r23, which the routine saves on the stack on entry.
// S10: a further destination pointer ("boffset3"), computed from N with
//      bit 0 cleared - presumably for the N & 1 tail; TODO confirm.
#define S8 $r20 | |||
#define S9 $r23 | |||
#define S10 $r11 | |||
// $r0 is used as the zero operand in the compare-and-branch instructions.
#define ZERO $r0 | |||
// Scalar FP register names - presumably used by narrow tail copies; TODO confirm.
#define F0 $f0 | |||
#define F1 $f1 | |||
#define F2 $f2 | |||
#define F3 $f3 | |||
#define F4 $f4 | |||
#define F5 $f5 | |||
#define F6 $f6 | |||
#define F7 $f7 | |||
/* LASX vectors */ | |||
// 256-bit register names; the 4-wide loop uses U0-U3 as load/store temporaries.
#define U0 $xr0 | |||
#define U1 $xr1 | |||
#define U2 $xr2 | |||
#define U3 $xr3 | |||
#define U4 $xr4 | |||
#define U5 $xr5 | |||
#define U6 $xr6 | |||
#define U7 $xr7 | |||
#define U8 $xr8 | |||
#define U9 $xr9 | |||
#define U10 $xr10 | |||
#define U11 $xr11 | |||
#define U12 $xr12 | |||
#define U13 $xr13 | |||
#define U14 $xr14 | |||
#define U15 $xr15 | |||
// Packing copy for 8-byte elements (pairs of single-precision floats). | |||
// | |||
// Input : M source lines of N contiguous elements; consecutive lines are LDA | |||
//         elements apart, starting at SRC. | |||
// Output: DST, written through three cursors: | |||
//         boffset1 (S8)  - tiles of 4 elements per line; for a given group of | |||
//                          lines the next tile is M * 32 bytes further on | |||
//         boffset2 (S9)  - the (n & 2) leftovers, starts at DST + 8 * M * (N & ~3) | |||
//         boffset3 (S10) - the (n & 1) leftover, starts at DST + 8 * M * (N & ~1) | |||
// Lines are consumed four at a time, then two (m & 2), then one (m & 1). | |||
PROLOGUE | |||
// S9 is callee-saved $r23. Reserve 16 bytes (not 8) so that $sp stays | |||
// 16-byte aligned while the frame is live. | |||
addi.d $sp, $sp, -16 | |||
SDARG $r23, $sp, 0 | |||
move TS, SRC //aoffset | |||
move TD, DST //boffset | |||
slli.d TL, LDA, 0x03 //lda in bytes (8 bytes per element) | |||
// boffset2 = DST + 8 * M * (N & ~3). mul.d keeps the full 64-bit product; | |||
// a 32-bit multiply would truncate it before it is turned into a byte offset. | |||
ori T0, ZERO, 0x03 | |||
andn T0, N, T0 | |||
mul.d T0, M, T0 | |||
slli.d T0, T0, 0x03 | |||
add.d S9, DST, T0 //boffset2 | |||
// boffset3 = DST + 8 * M * (N & ~1) | |||
ori T0, ZERO, 0x01 | |||
andn T0, N, T0 | |||
mul.d T0, M, T0 | |||
slli.d T0, T0, 0x03 | |||
add.d S10, DST, T0 //boffset3 | |||
srai.d J, M, 0x02 //j | |||
beq J, ZERO, .L_M1 | |||
.L_J1: /* if(j>0) j--*/ | |||
// Four line pointers, then step aoffset past these four lines. | |||
move S1, TS //aoffset1 | |||
add.d S2, S1, TL | |||
add.d S3, S2, TL | |||
add.d S4, S3, TL | |||
slli.d T0, TL, 0x02 | |||
add.d TS, TS, T0 | |||
// This group's first tile is at boffset; the next group starts 128 bytes later. | |||
move S8, TD //boffset1 | |||
addi.d TD, TD, 0x80 | |||
srai.d I, N, 0x02 | |||
beq ZERO, I, .L_JN1 | |||
// Tile stride M * 32 bytes is loop-invariant, so compute it once. | |||
slli.d T0, M, 0x05 | |||
.L_JI1: /* if(i>0) i--*/ | |||
// 4 elements (32 bytes) from each of the 4 lines -> 128 contiguous bytes. | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvld U2, S3, 0x00 | |||
xvld U3, S4, 0x00 | |||
xvst U0, S8, 0x00 | |||
xvst U1, S8, 0x20 | |||
xvst U2, S8, 0x40 | |||
xvst U3, S8, 0x60 | |||
addi.d S1, S1, 0x20 | |||
addi.d S2, S2, 0x20 | |||
addi.d S3, S3, 0x20 | |||
addi.d S4, S4, 0x20 | |||
add.d S8, S8, T0 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_JI1 | |||
.L_JN1: /* if(n&2) */ | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_JN2 | |||
// 2 elements (16 bytes) from each line -> 64 bytes at boffset2. | |||
vld $vr0, S1, 0x00 | |||
vld $vr1, S2, 0x00 | |||
vld $vr2, S3, 0x00 | |||
vld $vr3, S4, 0x00 | |||
vst $vr0, S9, 0x00 | |||
vst $vr1, S9, 0x10 | |||
vst $vr2, S9, 0x20 | |||
vst $vr3, S9, 0x30 | |||
addi.d S1, S1, 0x10 | |||
addi.d S2, S2, 0x10 | |||
addi.d S3, S3, 0x10 | |||
addi.d S4, S4, 0x10 | |||
addi.d S9, S9, 0x40 | |||
.L_JN2: /* if(n&1) */ | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_J0 | |||
// 1 element (two floats) from each line -> 32 bytes at boffset3. | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fld.s F2, S2, 0x00 | |||
fld.s F3, S2, 0x04 | |||
fld.s F4, S3, 0x00 | |||
fld.s F5, S3, 0x04 | |||
fld.s F6, S4, 0x00 | |||
fld.s F7, S4, 0x04 | |||
fst.s F0, S10, 0x00 | |||
fst.s F1, S10, 0x04 | |||
fst.s F2, S10, 0x08 | |||
fst.s F3, S10, 0x0c | |||
fst.s F4, S10, 0x10 | |||
fst.s F5, S10, 0x14 | |||
fst.s F6, S10, 0x18 | |||
fst.s F7, S10, 0x1c | |||
addi.d S10, S10, 0x20 | |||
.L_J0: | |||
addi.d J, J, -1 | |||
blt ZERO, J, .L_J1 | |||
.L_M1: /* if(m&2) */ | |||
andi I, M, 0x02 | |||
beq ZERO, I, .L_M2 | |||
// Same scheme with two lines: 64 bytes per tile. | |||
move S1, TS //aoffset1 | |||
add.d S2, S1, TL | |||
slli.d T0, TL, 0x01 | |||
add.d TS, TS, T0 | |||
move S8, TD //boffset1 | |||
addi.d TD, TD, 0x40 | |||
srai.d I, N, 0x02 | |||
beq ZERO, I, .L_M1N1 | |||
slli.d T0, M, 0x05 | |||
.L_M1I1: /* if(i>0) */ | |||
xvld U0, S1, 0x00 | |||
xvld U1, S2, 0x00 | |||
xvst U0, S8, 0x00 | |||
xvst U1, S8, 0x20 | |||
addi.d S1, S1, 0x20 | |||
addi.d S2, S2, 0x20 | |||
add.d S8, S8, T0 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_M1I1 | |||
.L_M1N1: /* if(n&2) */ | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M1N2 | |||
vld $vr0, S1, 0x00 | |||
vld $vr1, S2, 0x00 | |||
vst $vr0, S9, 0x00 | |||
vst $vr1, S9, 0x10 | |||
addi.d S1, S1, 0x10 | |||
addi.d S2, S2, 0x10 | |||
addi.d S9, S9, 0x20 | |||
.L_M1N2: /* if(n&1) */ | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M2 | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fld.s F2, S2, 0x00 | |||
fld.s F3, S2, 0x04 | |||
fst.s F0, S10, 0x00 | |||
fst.s F1, S10, 0x04 | |||
fst.s F2, S10, 0x08 | |||
fst.s F3, S10, 0x0c | |||
addi.d S10, S10, 0x10 | |||
.L_M2: /* if(m&1) */ | |||
andi I, M, 0x01 | |||
beq ZERO, I, .L_M0 | |||
// Last single line: 32 bytes per tile; no pointer needs advancing afterwards. | |||
move S1, TS //aoffset1 | |||
move S8, TD //boffset1 | |||
srai.d I, N, 0x02 | |||
beq ZERO, I, .L_M2N1 | |||
slli.d T0, M, 0x05 | |||
.L_M2I1: /* if(i>0) */ | |||
xvld U0, S1, 0x00 | |||
xvst U0, S8, 0x00 | |||
addi.d S1, S1, 0x20 | |||
add.d S8, S8, T0 | |||
addi.d I, I, -1 | |||
blt ZERO, I, .L_M2I1 | |||
.L_M2N1: /* if(n&2) */ | |||
andi I, N, 0x02 | |||
beq ZERO, I, .L_M2N2 | |||
vld $vr0, S1, 0x00 | |||
vst $vr0, S9, 0x00 | |||
addi.d S1, S1, 0x10 | |||
.L_M2N2: /* if(n&1) */ | |||
andi I, N, 0x01 | |||
beq ZERO, I, .L_M0 | |||
fld.s F0, S1, 0x00 | |||
fld.s F1, S1, 0x04 | |||
fst.s F0, S10, 0x00 | |||
fst.s F1, S10, 0x04 | |||
.L_M0: | |||
// Restore $r23, release the frame and return. | |||
LDARG $r23, $sp, 0 | |||
addi.d $sp, $sp, 16 | |||
jirl $r0, $r1, 0x00 | |||
EPILOGUE |
@@ -0,0 +1,383 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2024/02/20 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Integer arguments as received in registers (see the CNAME prototype above). | |||
// INC_Y is not a register argument: the entry code loads it from the stack into $r6. | |||
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA_R $f0 | |||
#define ALPHA_I $f1 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
#define INC_Y $r6 | |||
// Scratch / loop state. OFFSET is defined but not referenced in the code shown in this file. | |||
#define J $r12 | |||
#define I $r13 | |||
#define K $r14 | |||
#define Y_ORG $r15 | |||
#define OFFSET $r16 | |||
#define K_LDA $r17 | |||
#define M8 $r18 | |||
#define T0 $r19 | |||
// PA0..PA7: pointers into eight consecutive columns of A (PA(k+1) = PA(k) + LDA bytes). | |||
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
// Vector registers. The code also addresses the low element of several of these | |||
// through the same-numbered scalar register (e.g. $f10 for Y0, $f12 for A0). | |||
#define VALPHA $xr1 | |||
#define X0 $xr2 | |||
#define X1 $xr3 | |||
#define X2 $xr4 | |||
#define X3 $xr5 | |||
#define X4 $xr6 | |||
#define X5 $xr7 | |||
#define X6 $xr8 | |||
#define X7 $xr9 | |||
#define Y0 $xr10 | |||
#define Y1 $xr11 | |||
#define A0 $xr12 | |||
#define A1 $xr13 | |||
#define A2 $xr14 | |||
#define A3 $xr15 | |||
#define A4 $xr16 | |||
#define A5 $xr17 | |||
#define A6 $xr18 | |||
#define A7 $xr19 | |||
#define A8 $xr20 | |||
#define A9 $xr21 | |||
#define A10 $xr22 | |||
#define A11 $xr23 | |||
#define A12 $xr24 | |||
#define A13 $xr25 | |||
#define A14 $xr26 | |||
#define A15 $xr27 | |||
#define TMP0 $xr28 | |||
#define TMP1 $xr29 | |||
#define TMP2 $xr30 | |||
// Map the build-time CONJ / XCONJ switches onto the 0/1 flags handed to | |||
// GCOMPLEXMUL (GXCONJ) and GCOMPLEXMADD (GXCONJ, GCONJ): | |||
// GXCONJ is 1 exactly when XCONJ is defined, GCONJ is 1 exactly when CONJ is defined. | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
#define GXCONJ 0 | |||
#define GCONJ 0 | |||
#else | |||
#define GXCONJ 1 | |||
#define GCONJ 0 | |||
#endif | |||
#else | |||
#if !defined(XCONJ) | |||
#define GXCONJ 0 | |||
#define GCONJ 1 | |||
#else | |||
#define GXCONJ 1 | |||
#define GCONJ 1 | |||
#endif | |||
#endif | |||
// CLOAD_X_8: unit-stride load of eight x elements (byte offsets 0x00..0x38 from X) | |||
// into X0..X7, then X(k) = X(k) * VALPHA via GCOMPLEXMUL with the GXCONJ setting. | |||
// GLDREPL is defined elsewhere; presumably it emits the same xvldrepl.d | |||
// (load one 8-byte element and replicate it) that CLOAD_X_8_GAP spells out. | |||
// Clobbers TMP0..TMP2. | |||
.macro CLOAD_X_8 | |||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ | |||
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \ | |||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \ | |||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \ | |||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \ | |||
X7, X7, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// CLOAD_X_8_GAP: strided counterpart of CLOAD_X_8. T0 walks from X in steps of | |||
// INC_X (a byte stride once the entry code has scaled it), loading and | |||
// replicating one 8-byte element per step into X0..X7; each is then multiplied | |||
// by VALPHA. X itself is left unchanged. Clobbers T0 and TMP0..TMP2. | |||
.macro CLOAD_X_8_GAP | |||
xvldrepl.d X0, X, 0x00 | |||
PTR_ADD T0, X, INC_X | |||
xvldrepl.d X1, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X2, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X3, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X4, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X5, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X6, T0, 0x00 | |||
PTR_ADD T0, T0, INC_X | |||
xvldrepl.d X7, T0, 0x00 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \ | |||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \ | |||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \ | |||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \ | |||
X7, X7, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// CLOAD_Y_8: unit-stride load of eight y elements (64 bytes) into Y0 (Y + 0) | |||
// and Y1 (Y + 0x20). GLD is defined elsewhere. | |||
.macro CLOAD_Y_8 | |||
GLD xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// CLOAD_Y_8_GAP: strided gather of eight y elements (stride INC_Y bytes) into Y0/Y1. | |||
// Elements 0 and 4 are loaded straight into the low lanes of Y0 ($f10) and Y1 ($f11); | |||
// elements 1..3 and 5..7 land in the low lanes of A1..A3 ($f13..$f15) and | |||
// A5..A7 ($f17..$f19) and are then inserted into lanes 1..3 by GINSVE0 | |||
// (defined elsewhere). Clobbers T0, A1..A3 and A5..A7. | |||
.macro CLOAD_Y_8_GAP | |||
fld.d $f10, Y, 0 | |||
fldx.d $f13, Y, INC_Y | |||
PTR_ALSL T0, INC_Y, Y, 1 | |||
fld.d $f14, T0, 0 | |||
fldx.d $f15, T0, INC_Y | |||
PTR_ALSL T0, INC_Y, Y, 2 | |||
fld.d $f11, T0, 0 | |||
fldx.d $f17, T0, INC_Y | |||
PTR_ADD T0, T0, INC_Y | |||
PTR_ADD T0, T0, INC_Y | |||
fld.d $f18, T0, 0 | |||
fldx.d $f19, T0, INC_Y | |||
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 | |||
.endm | |||
// CSTORE_Y_8_GAP: strided scatter, the inverse of CLOAD_Y_8_GAP. Writes the four | |||
// 8-byte lanes of Y0 and then of Y1 to Y, Y + INC_Y, ..., Y + 7 * INC_Y. | |||
// Y is left unchanged; clobbers T0. | |||
.macro CSTORE_Y_8_GAP | |||
xvstelm.d Y0, Y, 0, 0 | |||
PTR_ADD T0, Y, INC_Y | |||
xvstelm.d Y0, T0, 0, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 2 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y0, T0, 0, 3 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 0 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 2 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 3 | |||
.endm | |||
// CGEMV_N_8x8: update eight y elements (Y0, Y1) with an 8-row by 8-column tile of A. | |||
// Two vectors are loaded through each of PA0..PA7 into A0..A15; GLD_INC is defined | |||
// elsewhere and presumably advances the pointer by 0x20 after every load - TODO confirm. | |||
// Then, for column k: Y0 += X(k) * A(2k) and Y1 += X(k) * A(2k+1), | |||
// using GCOMPLEXMADD with the GXCONJ/GCONJ settings. Clobbers TMP0..TMP2. | |||
.macro CGEMV_N_8x8 | |||
GLD_INC xv, , 0x20, \ | |||
A0, PA0, 0, A1, PA0, 0, \ | |||
A2, PA1, 0, A3, PA1, 0, \ | |||
A4, PA2, 0, A5, PA2, 0, \ | |||
A6, PA3, 0, A7, PA3, 0, \ | |||
A8, PA4, 0, A9, PA4, 0, \ | |||
A10, PA5, 0, A11, PA5, 0, \ | |||
A12, PA6, 0, A13, PA6, 0, \ | |||
A14, PA7, 0, A15, PA7, 0 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2 | |||
.endm | |||
// CSTORE_Y_8: unit-stride store of Y0 to Y + 0 and Y1 to Y + 0x20 | |||
// (counterpart of CLOAD_Y_8; GST is defined elsewhere). | |||
.macro CSTORE_Y_8 | |||
GST xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// CLOAD_X_1: load the single x element at X into X0 (via GLDREPL, defined | |||
// elsewhere) and multiply it by VALPHA. Clobbers TMP0..TMP2. | |||
.macro CLOAD_X_1 | |||
GLDREPL xv, d, X0, X, 0x00 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// CLOAD_Y_1: load one 8-byte y element into $f10, the low lane of Y0. | |||
.macro CLOAD_Y_1 | |||
fld.d $f10, Y, 0 | |||
.endm | |||
// CGEMV_N_1x8: update one y element (low lane of Y0) with one row of eight columns. | |||
// One 8-byte element is loaded through each of PA0..PA7 into $f12, $f14, ..., $f26, | |||
// i.e. the low lanes of A0, A2, ..., A14 (GLD_INC is defined elsewhere; presumably | |||
// it advances each pointer by 0x08 - TODO confirm). Then Y0 += X(k) * A(2k) for k = 0..7. | |||
// Clobbers TMP0..TMP2. | |||
.macro CGEMV_N_1x8 | |||
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ | |||
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X7, A14, Y0, TMP0, TMP1, TMP2 | |||
.endm | |||
// CSTORE_Y_1: store $f10 (the low lane of Y0) back to the y element at Y. | |||
.macro CSTORE_Y_1 | |||
fst.d $f10, Y, 0 | |||
.endm | |||
// CGEMV_N_1x1: one row, one column. Loads one element of A through PA0 into | |||
// $f12 (low lane of A0), advances PA0 by 8 bytes, then Y0 += X0 * A0. | |||
// Clobbers TMP0..TMP2. | |||
.macro CGEMV_N_1x1 | |||
fld.d $f12, PA0, 0 | |||
PTR_ADDI PA0, PA0, 0x08 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||
.endm | |||
// CGEMV_N_LASX: full loop nest of the non-transposed kernel for one stride variant. | |||
//   XW                 - infix that makes this expansion's labels unique | |||
//   X_8, X_1, Y_8, Y_1 - suffixes pasted onto CLOAD_ / CSTORE_ to pick the | |||
//                        unit-stride or _GAP load/store macros | |||
// Columns are taken eight at a time, then one at a time; for each column group the | |||
// whole of y is re-walked from Y_ORG. Every path ends with a branch to .L_END. | |||
// Note: K is cleared and incremented here but is not read anywhere in the code shown. | |||
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req | |||
PTR_SRLI J, N, 3 | |||
beqz J, .L_\XW\()_N_7 | |||
// K_LDA = 8 * LDA - M8: added to PA0..PA7 after a pass over the rows to reach the | |||
// next group of eight columns (assumes the row loops advanced each by M8 bytes). | |||
PTR_SLLI K_LDA, LDA, 3 | |||
PTR_SUB K_LDA, K_LDA, M8 | |||
.L_\XW\()_N_L8: | |||
// Eight x elements, pre-multiplied by alpha. | |||
CLOAD_\X_8 | |||
xor K, K, K | |||
move Y, Y_ORG | |||
PTR_SRLI I, M, 3 | |||
beqz I, .L_\XW\()_M_7 | |||
.align 5 | |||
.L_\XW\()_M_L8: | |||
// Eight rows per iteration: load y, accumulate the 8x8 tile, store y. | |||
CLOAD_\Y_8 | |||
CGEMV_N_8x8 | |||
CSTORE_\Y_8 | |||
PTR_ADDI I, I, -1 | |||
PTR_ALSL Y, INC_Y, Y, 3 | |||
PTR_ADDI K, K, 8 | |||
bnez I, .L_\XW\()_M_L8 | |||
.L_\XW\()_M_7: | |||
// Remaining M & 7 rows, one at a time, still against all eight columns. | |||
andi I, M, 7 | |||
beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
CLOAD_\Y_1 | |||
CGEMV_N_1x8 | |||
CSTORE_\Y_1 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD Y, Y, INC_Y | |||
PTR_ADDI K, K, 1 | |||
bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
// Next group of eight columns: bump every column pointer by K_LDA and X by 8 * INC_X. | |||
PTR_ADDI J, J, -1 | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#else | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#endif | |||
PTR_ALSL X, INC_X, X, 3 | |||
bnez J, .L_\XW\()_N_L8 | |||
.L_\XW\()_N_7: | |||
// Remaining N & 7 columns, one column per outer iteration, using PA0 only. | |||
andi J, N, 7 | |||
beqz J, .L_END | |||
.L_\XW\()_N_L1: | |||
CLOAD_\X_1 | |||
xor K, K, K | |||
move Y, Y_ORG | |||
move I, M | |||
beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
CLOAD_\Y_1 | |||
CGEMV_N_1x1 | |||
CSTORE_\Y_1 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD Y, Y, INC_Y | |||
PTR_ADDI K, K, 1 | |||
bnez I, .L_\XW\()_N_1_M_L1 | |||
.L_\XW\()_N_1_M_END: | |||
// PA0 has advanced by M8 bytes (8 per row in CGEMV_N_1x1); add LDA - M8 to reach the next column. | |||
PTR_ADDI J, J, -1 | |||
PTR_SUB K_LDA, LDA, M8 | |||
PTR_ADD PA0, PA0, K_LDA | |||
PTR_ADD X, X, INC_X | |||
bnez J, .L_\XW\()_N_L1 | |||
b .L_END | |||
.endm | |||
// Entry point of the non-transposed kernel: sets up strides, alpha and the | |||
// column pointers, then dispatches to one of four CGEMV_N_LASX expansions | |||
// depending on whether inc_x and inc_y equal 1. | |||
PROLOGUE | |||
// inc_y is read from the caller's stack at $sp + 0; this happens before | |||
// push_if_used (defined elsewhere, presumably saving callee-saved registers and moving $sp). | |||
PTR_LD INC_Y, $sp, 0 | |||
push_if_used 17 + 7, 31 | |||
// Dispatch index I = 2 * (inc_x != 1) + (inc_y != 1), computed on the unscaled increments. | |||
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
PTR_SUB J, INC_Y, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ | |||
PTR_ALSL I, I, J, 1 | |||
// Convert element counts to byte counts (8 bytes per element): LDA, INC_X, INC_Y, and M8 = M * 8. | |||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
// Init VALPHA | |||
// Interleave alpha_r ($f0) and alpha_i ($f1) into one 8-byte pair, then replicate it into VALPHA. | |||
xvpackev.w $xr0, $xr1, $xr0 | |||
xvreplve0.d VALPHA, $xr0 | |||
move Y_ORG, Y | |||
// PA0..PA7 point at eight consecutive columns of A, LDA bytes apart. | |||
move PA0, A | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#else | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#endif | |||
// Jump through a table of 16-bit offsets relative to .L_GAP_TABLE, indexed by I. | |||
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 // Obtain the offset address | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
.L_GAP_TABLE: | |||
.hword .L_GAP_0_0 - .L_GAP_TABLE | |||
.hword .L_GAP_0_1 - .L_GAP_TABLE | |||
.hword .L_GAP_1_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1_1 - .L_GAP_TABLE | |||
// Each expansion ends with "b .L_END", so there is no fall-through between variants. | |||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1 | |||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1 | |||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1 | |||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 | |||
.L_END: | |||
pop_if_used 17 + 7, 31 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE |
@@ -0,0 +1,342 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2024/02/20 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Integer arguments as received in registers (see the CNAME prototype above). | |||
// INC_Y is not a register argument: the entry code loads it from the stack into $r6. | |||
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA_R $f0 | |||
#define ALPHA_I $f1 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
#define INC_Y $r6 | |||
// Scratch / loop state. K and PY0 share $r14: K is only used by the entry code, | |||
// PY0..PY2 are extra y pointers used when eight results are written back. | |||
#define J $r12 | |||
#define I $r13 | |||
#define K $r14 | |||
#define PY0 $r14 | |||
#define X_ORG $r15 | |||
#define PY1 $r16 | |||
#define K_LDA $r17 | |||
#define PY2 $r18 | |||
#define T0 $r19 | |||
// PA0..PA7: pointers into eight consecutive columns of A (PA(k+1) = PA(k) + LDA bytes). | |||
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
#define M8 $r30 | |||
// Vector registers. The code also addresses the low element of several of these | |||
// through the same-numbered scalar register (e.g. $f1 for X0, $f3 for A0). | |||
#define VALPHA $xr0 | |||
#define X0 $xr1 | |||
#define X1 $xr2 | |||
#define A0 $xr3 | |||
#define A1 $xr4 | |||
#define A2 $xr5 | |||
#define A3 $xr6 | |||
#define A4 $xr7 | |||
#define A5 $xr8 | |||
#define A6 $xr9 | |||
#define A7 $xr10 | |||
#define A8 $xr11 | |||
#define A9 $xr12 | |||
#define A10 $xr13 | |||
#define A11 $xr14 | |||
#define A12 $xr15 | |||
#define A13 $xr16 | |||
#define A14 $xr17 | |||
#define A15 $xr18 | |||
// TP0..TP7: per-column partial sums. | |||
#define TP0 $xr19 | |||
#define TP1 $xr20 | |||
#define TP2 $xr21 | |||
#define TP3 $xr22 | |||
#define TP4 $xr23 | |||
#define TP5 $xr24 | |||
#define TP6 $xr25 | |||
#define TP7 $xr26 | |||
#define TMP0 $xr27 | |||
#define TMP1 $xr28 | |||
#define TMP2 $xr29 | |||
// Y0..Y7 are second names for the registers of A0..A7 ($xr3..$xr10). | |||
#define Y0 $xr3 | |||
#define Y1 $xr4 | |||
#define Y2 $xr5 | |||
#define Y3 $xr6 | |||
#define Y4 $xr7 | |||
#define Y5 $xr8 | |||
#define Y6 $xr9 | |||
#define Y7 $xr10 | |||
// Flags for the A * x accumulation (GXCONJ1/GCONJ1): GXCONJ1 is 1 when exactly | |||
// one of CONJ / XCONJ is defined; GCONJ1 is always 0. | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
#define GXCONJ1 0 | |||
#define GCONJ1 0 | |||
#else | |||
#define GXCONJ1 1 | |||
#define GCONJ1 0 | |||
#endif | |||
// Flags for the final y += alpha * sum step (GXCONJ2/GCONJ2): GCONJ2 is 1 when | |||
// XCONJ is defined; GXCONJ2 is always 0. | |||
#if !defined(XCONJ) | |||
#define GXCONJ2 0 | |||
#define GCONJ2 0 | |||
#else | |||
#define GXCONJ2 0 | |||
#define GCONJ2 1 | |||
#endif | |||
// ZERO_Y8: clear the eight partial-sum vectors TP0..TP7 by XOR-ing each with itself | |||
// (GXOR is defined elsewhere). | |||
.macro ZERO_Y8 | |||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ | |||
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 | |||
.endm | |||
// ZERO_Y1: clear the single partial-sum vector TP0 (used by the one-column tail). | |||
.macro ZERO_Y1 | |||
GXOR xv, v, TP0, TP0, TP0 | |||
.endm | |||
// CLOAD_X8: unit-stride load of eight x elements (64 bytes) into X0 (X + 0x00) | |||
// and X1 (X + 0x20). GLD is defined elsewhere. | |||
.macro CLOAD_X8 | |||
GLD xv, , X0, X, 0x00, X1, X, 0x20 | |||
.endm | |||
// CLOAD_X8_GAP: strided gather of eight x elements (stride INC_X bytes) into X0/X1. | |||
// First half: element 0 goes to $f1 (low lane of X0); elements 1..3 go to | |||
// $f2/$f3/$f4 (low lanes of X1, A0, A1) and are inserted into lanes 1..3 of X0. | |||
// Second half: element 4 goes to $f2 (low lane of X1); elements 5..7 go to | |||
// $f3/$f4/$f5 (low lanes of A0, A1, A2) and are inserted into lanes 1..3 of X1. | |||
// GINSVE0 is defined elsewhere. X is left unchanged; clobbers T0 and A0..A2. | |||
.macro CLOAD_X8_GAP | |||
fld.d $f1, X, 0x00 | |||
fldx.d $f2, X, INC_X | |||
PTR_ALSL T0, INC_X, X, 1 | |||
fld.d $f3, T0, 0x00 | |||
fldx.d $f4, T0, INC_X | |||
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | |||
PTR_ALSL T0, INC_X, X, 2 | |||
fld.d $f2, T0, 0x00 | |||
fldx.d $f3, T0, INC_X | |||
PTR_ALSL T0, INC_X, T0, 1 | |||
fld.d $f4, T0, 0x00 | |||
fldx.d $f5, T0, INC_X | |||
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 | |||
.endm | |||
// CGEMV_T_8x8: accumulate an 8-row by 8-column tile of A against X0/X1. | |||
// Two vectors are loaded through each of PA0..PA7 into A0..A15; GLD_INC is defined | |||
// elsewhere and presumably advances the pointer by 0x20 after every load - TODO confirm. | |||
// Then, for column k: TP(k) += A(2k) * X0 and TP(k) += A(2k+1) * X1, | |||
// using GCOMPLEXMADD with the GXCONJ1/GCONJ1 settings. Clobbers TMP0..TMP2. | |||
.macro CGEMV_T_8x8 | |||
GLD_INC xv, , 0x20, \ | |||
A0, PA0, 0, A1, PA0, 0, \ | |||
A2, PA1, 0, A3, PA1, 0, \ | |||
A4, PA2, 0, A5, PA2, 0, \ | |||
A6, PA3, 0, A7, PA3, 0, \ | |||
A8, PA4, 0, A9, PA4, 0, \ | |||
A10, PA5, 0, A11, PA5, 0, \ | |||
A12, PA6, 0, A13, PA6, 0, \ | |||
A14, PA7, 0, A15, PA7, 0 | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ | |||
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ | |||
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \ | |||
TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \ | |||
TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \ | |||
TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \ | |||
TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2 | |||
.endm | |||
// CGEMV_T_LASX: full loop nest of the transposed kernel for one x-stride variant. | |||
//   XW - infix that makes this expansion's labels unique | |||
//   X8 - suffix pasted onto CLOAD_ to pick CLOAD_X8 or CLOAD_X8_GAP | |||
// Columns are taken eight at a time, then one at a time; for each column group the | |||
// whole of x is re-walked from X_ORG and one y element per column is updated. | |||
// y is always accessed with the INC_Y stride. Every path ends with a branch to .L_END. | |||
.macro CGEMV_T_LASX XW:req, X8:req | |||
PTR_SRLI J, N, 3 | |||
beqz J, .L_\XW\()_N_7 | |||
// K_LDA = 8 * LDA - M8: added to PA0..PA7 after a pass over the rows to reach the | |||
// next group of eight columns (assumes the row loops advanced each by M8 bytes). | |||
PTR_SLLI K_LDA, LDA, 3 | |||
PTR_SUB K_LDA, K_LDA, M8 | |||
.L_\XW\()_N_L8: | |||
ZERO_Y8 | |||
move X, X_ORG | |||
PTR_SRLI I, M, 3 | |||
beqz I, .L_\XW\()_M_7 | |||
.align 5 | |||
.L_\XW\()_M_L8: | |||
// Eight rows per iteration, accumulated into TP0..TP7. | |||
CLOAD_\X8 | |||
CGEMV_T_8x8 | |||
PTR_ADDI I, I, -1 | |||
PTR_ALSL X, INC_X, X, 3 | |||
bnez I, .L_\XW\()_M_L8 | |||
.L_\XW\()_M_7: | |||
// Accumulated | |||
// Reduce each TP(k) into Y(k); Y0..Y7 are the same registers as A0..A7. | |||
GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ | |||
Y5, TP5, Y6, TP6, Y7, TP7 | |||
andi I, M, 7 | |||
beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
// Remaining M & 7 rows: one x element in $f1 (low lane of X0) and one element | |||
// per column in $f11..$f18 (low lanes of A8..A15), added onto the sums in A0..A7. | |||
fld.d $f1, X, 0x00 | |||
fld.d $f11, PA0, 0x00 | |||
fld.d $f12, PA1, 0x00 | |||
fld.d $f13, PA2, 0x00 | |||
fld.d $f14, PA3, 0x00 | |||
fld.d $f15, PA4, 0x00 | |||
fld.d $f16, PA5, 0x00 | |||
fld.d $f17, PA6, 0x00 | |||
fld.d $f18, PA7, 0x00 | |||
#if __loongarch_grlen == 64 | |||
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
#elif __loongarch_grlen == 32 | |||
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
#else | |||
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
#endif | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ | |||
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \ | |||
A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \ | |||
A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
// Load the eight y elements (stride INC_Y) into $f11..$f18 (low lanes of A8..A15); | |||
// PY0 = Y + 2 * INC_Y, PY1 = Y + 4 * INC_Y, PY2 = Y + 6 * INC_Y. | |||
fld.d $f11, Y, 0x00 | |||
fldx.d $f12, Y, INC_Y | |||
PTR_ALSL PY0, INC_Y, Y, 1 | |||
fld.d $f13, PY0, 0x00 | |||
fldx.d $f14, PY0, INC_Y | |||
PTR_ALSL PY1, INC_Y, Y, 2 | |||
fld.d $f15, PY1, 0x00 | |||
fldx.d $f16, PY1, INC_Y | |||
PTR_ALSL PY2, INC_Y, PY1, 1 | |||
fld.d $f17, PY2, 0x00 | |||
fldx.d $f18, PY2, INC_Y | |||
// y(k) += alpha * sum(k), with the GXCONJ2/GCONJ2 settings. | |||
GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ | |||
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\ | |||
A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\ | |||
A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2 | |||
PTR_ADDI J, J, -1 | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#else | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
#endif | |||
// Write the eight updated y elements back and step Y by 8 * INC_Y. | |||
fst.d $f11, Y, 0x00 | |||
fstx.d $f12, Y, INC_Y | |||
fst.d $f13, PY0, 0x00 | |||
fstx.d $f14, PY0, INC_Y | |||
fst.d $f15, PY1, 0x00 | |||
fstx.d $f16, PY1, INC_Y | |||
fst.d $f17, PY2, 0x00 | |||
fstx.d $f18, PY2, INC_Y | |||
PTR_ALSL Y, INC_Y, Y, 3 | |||
bnez J, .L_\XW\()_N_L8 | |||
.L_\XW\()_N_7: | |||
// Remaining N & 7 columns, one column per outer iteration, using PA0 only. | |||
andi J, N, 7 | |||
beqz J, .L_END | |||
PTR_SUB K_LDA, LDA, M8 | |||
.L_\XW\()_N_1: | |||
ZERO_Y1 | |||
move X, X_ORG | |||
move I, M | |||
beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
// One element of A in $f3 (low lane of A0) times one x element in $f1 (low lane of X0). | |||
fld.d $f3, PA0, 0x00 | |||
fld.d $f1, X, 0x00 | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
PTR_ADDI PA0, PA0, 0x08 | |||
bnez I, .L_\XW\()_N_1_M_L1 | |||
.L_\XW\()_N_1_M_END: | |||
// y += alpha * sum for this column, then move to the next column and y element. | |||
PTR_ADDI J, J, -1 | |||
fld.d $f3, Y, 0x00 | |||
GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||
fst.d $f3, Y, 0x00 | |||
PTR_ADD PA0, PA0, K_LDA | |||
PTR_ADD Y, Y, INC_Y | |||
bnez J, .L_\XW\()_N_1 | |||
b .L_END | |||
.endm | |||
// Entry point of the transposed kernel: sets up strides, alpha and the column | |||
// pointers, then dispatches to one of two CGEMV_T_LASX expansions depending on | |||
// whether inc_x equals 1. | |||
PROLOGUE | |||
// inc_y is read from the caller's stack at $sp + 0; this happens before | |||
// push_if_used (defined elsewhere, presumably saving callee-saved registers and moving $sp). | |||
PTR_LD INC_Y, $sp, 0 | |||
push_if_used 17 + 8, 30 | |||
// Dispatch index I = (inc_x != 1), computed on the unscaled increment. | |||
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
// Convert element counts to byte counts (8 bytes per element): LDA, INC_X, INC_Y, and M8 = M * 8. | |||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
// Init VALPHA | |||
// Interleave alpha_r ($f0) and alpha_i ($f1) into one 8-byte pair, then replicate it | |||
// into VALPHA (which is $xr0 itself in this file). | |||
xvpackev.w $xr0, $xr1, $xr0 | |||
xvreplve0.d VALPHA, $xr0 | |||
move X_ORG, X | |||
// PA0..PA7 point at eight consecutive columns of A, LDA bytes apart. | |||
move PA0, A | |||
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#else | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
#endif | |||
// Jump through a table of 16-bit offsets relative to .L_GAP_TABLE, indexed by I. | |||
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
.L_GAP_TABLE: | |||
.hword .L_GAP_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1 - .L_GAP_TABLE | |||
// Each expansion ends with "b .L_END", so there is no fall-through between variants. | |||
.L_GAP_0: /* if (incx == 1) */ | |||
CGEMV_T_LASX GAP_0, X8 | |||
.L_GAP_1: /* if (incx != 1) */ | |||
CGEMV_T_LASX GAP_1, X8_GAP | |||
.L_END: | |||
pop_if_used 17 + 8, 30 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE |
@@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endif | |||
.endm | |||
// | |||
// GCOMPLEXACC: Complex accumulate the values of vector registers | |||
// pre_op: xvf or vf, differentiate between LSX or LASX instruction | |||
// suf_op: s or d, differentiate between single precision or double precision complex numbers | |||
// Note: When "pre_op = xvf && suf_op = s", in will be modified. | |||
// | |||
// Horizontally sums the complex elements held in "in" and leaves the total in | |||
// the lowest complex element of "out": | |||
//   xvf / s : fold the two 128-bit halves (result written to "in"), then fold | |||
//             the two 64-bit pairs of that into "out" | |||
//   xvf / d : fold the two 128-bit halves into "out" | |||
//   vf  / s : fold the two 64-bit pairs into "out" | |||
//   vf  / d : no instruction is emitted, so "out" is left untouched | |||
// "out" and "in" are expected to be different registers. Any further out/in | |||
// pairs passed in "more" are processed recursively with the same pre_op/suf_op. | |||
// | |||
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | |||
.ifeqs "\pre_op", "xvf" | |||
xvpermi.q \out, \in, 0x01 | |||
.ifeqs "\suf_op", "s" | |||
\pre_op\()add.\suf_op \in, \out, \in | |||
xvpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.else | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
.ifeqs "\pre_op", "vf" | |||
.ifeqs "\suf_op", "s" | |||
vpackod.d \out, \in, \in | |||
\pre_op\()add.\suf_op \out, \out, \in | |||
.endif | |||
.endif | |||
.ifnb \more | |||
GCOMPLEXACC \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GCOMPLEXMUL: Complex multiplication, out = in0 * in1 | |||
// xconj: default value 0. | |||
// if !(xconj) | |||
// out_r = in0_r * in1_r - in0_i * in1_i; | |||
// out_i = in0_r * in1_i + in0_i * in1_r; | |||
// else | |||
// out_r = in0_r * in1_r + in0_i * in1_i; | |||
// out_i = in0_r * in1_i - in0_i * in1_r; | |||
// pre_op: xvf or vf, differentiate between LSX or LASX instruction | |||
// suf_op: s or d, differentiate between single precision or double precision complex numbers | |||
// | |||
// Working registers (all three are overwritten): | |||
//   tmp0 = in0 with its real part duplicated into both slots of each pair | |||
//   tmp1 = in0's imaginary part in both slots, negated in the real slot when | |||
//          xconj is 0 and in the imaginary slot otherwise | |||
//   tmp2 = in1 with real and imaginary parts swapped | |||
// The result is out = tmp0 * in1 + tmp1 * tmp2. | |||
// in0 is fully consumed into tmp0/tmp1 before out is written, so out may be | |||
// the same register as in0; out must not be one of the tmp registers. | |||
// Further (out, in0, in1, tmp0, tmp1, tmp2) groups passed in "more" are | |||
// processed recursively with the same xconj/pre_op/suf_op. | |||
// | |||
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | |||
.ifeqs "\pre_op", "xvf" | |||
xvxor.v \tmp1, \tmp1, \tmp1 | |||
.ifeqs "\suf_op", "s" | |||
xvpackev.w \tmp0, \in0, \in0 | |||
.else | |||
xvpackev.d \tmp0, \in0, \in0 | |||
.endif | |||
.else | |||
vxor.v \tmp1, \tmp1, \tmp1 | |||
.ifeqs "\suf_op", "s" | |||
vpackev.w \tmp0, \in0, \in0 | |||
.else | |||
vpackev.d \tmp0, \in0, \in0 | |||
.endif | |||
.endif | |||
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | |||
.ifeqs "\pre_op", "xvf" | |||
.ifeqs "\suf_op", "s" | |||
.ifeqs "\xconj", "0" | |||
xvpackod.w \tmp1, \in0, \tmp1 | |||
.else | |||
xvpackod.w \tmp1, \tmp1, \in0 | |||
.endif | |||
xvshuf4i.w \tmp2, \in1, 0xb1 | |||
.else | |||
.ifeqs "\xconj", "0" | |||
xvpackod.d \tmp1, \in0, \tmp1 | |||
.else | |||
xvpackod.d \tmp1, \tmp1, \in0 | |||
.endif | |||
xvshuf4i.d \tmp2, \in1, 0x0b | |||
.endif | |||
.else | |||
.ifeqs "\suf_op", "s" | |||
.ifeqs "\xconj", "0" | |||
vpackod.w \tmp1, \in0, \tmp1 | |||
.else | |||
vpackod.w \tmp1, \tmp1, \in0 | |||
.endif | |||
vshuf4i.w \tmp2, \in1, 0xb1 | |||
.else | |||
.ifeqs "\xconj", "0" | |||
vpackod.d \tmp1, \in0, \tmp1 | |||
.else | |||
vpackod.d \tmp1, \tmp1, \in0 | |||
.endif | |||
vshuf4i.d \tmp2, \in1, 0x0b | |||
.endif | |||
.endif | |||
\pre_op\()mul.\suf_op \out, \tmp0, \in1 | |||
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out | |||
.ifnb \more | |||
GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2 | |||
// xconj: default value 0 | |||
// conj: default value 0 | |||
// if !(CONJ) | |||
// if !(XCONJ) | |||
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; | |||
// out_i = in0_r * in1_i + in0_i * in1_r + in2_i; | |||
// else | |||
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; | |||
// out_i = in0_r * in1_i - in0_i * in1_r + in2_i; | |||
// else | |||
// if !(XCONJ) | |||
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; | |||
// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r); | |||
// else | |||
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; | |||
// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r); | |||
// pre_op: xvf or vf, differentiate between LSX or LASX instruction | |||
// suf_op: s or d, differentiate between single precision or double precision complex numbers | |||
// | |||
// Expands to out = in0 * in1 + in2 (in the conj/xconj variant given by the
// formula table above) for each (out, in0, in1, in2, tmp0, tmp1, tmp2) group;
// groups passed in "more" are expanded recursively with the same selectors.
// tmp0, tmp1 and tmp2 are clobbered. in2 is last read before out is first
// written, so out may be the same register as in2. in0 and in1 are still read
// after out has been written, so out must not alias either of them.
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | |||
// tmp1 = 0; tmp0 = even lanes of in0 duplicated into both lanes of each pair.
.ifeqs "\pre_op", "xvf" | |||
xvxor.v \tmp1, \tmp1, \tmp1 | |||
.ifeqs "\suf_op", "s" | |||
xvpackev.w \tmp0, \in0, \in0 | |||
.else | |||
xvpackev.d \tmp0, \in0, \in0 | |||
.endif | |||
.else | |||
vxor.v \tmp1, \tmp1, \tmp1 | |||
.ifeqs "\suf_op", "s" | |||
vpackev.w \tmp0, \in0, \in0 | |||
.else | |||
vpackev.d \tmp0, \in0, \in0 | |||
.endif | |||
.endif | |||
// tmp2 = tmp0 * in1 + in2.
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 | |||
.ifeqs "\conj", "1" | |||
// conj path: tmp0 = in2 - tmp0 * in1, then out takes its even lanes from tmp2
// and its odd lanes from the odd lanes of that difference (swap + pack-even).
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | |||
.ifeqs "\pre_op", "xvf" | |||
.ifeqs "\suf_op", "s" | |||
xvshuf4i.w \tmp0, \tmp0, 0xb1 | |||
xvpackev.w \out, \tmp0, \tmp2 | |||
.else | |||
xvshuf4i.d \tmp0, \tmp0, 0x0b | |||
xvpackev.d \out, \tmp0, \tmp2 | |||
.endif | |||
.else | |||
.ifeqs "\suf_op", "s" | |||
vshuf4i.w \tmp0, \tmp0, 0xb1 | |||
vpackev.w \out, \tmp0, \tmp2 | |||
.else | |||
vshuf4i.d \tmp0, \tmp0, 0x0b | |||
vpackev.d \out, \tmp0, \tmp2 | |||
.endif | |||
.endif /* pre_op = xvf */ | |||
.else | |||
// Non-conj path: tmp1 is still zero here, so this copies tmp2 into out.
\pre_op\()add.\suf_op \out, \tmp2, \tmp1 | |||
.endif /* conj = 1 */ | |||
// tmp1 = 0 - in0 (negated copy of in0).
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | |||
// tmp1 = odd lanes taken from in0 and/or its negation (the combination is
// chosen by conj and xconj); tmp2 = in1 with the two lanes of each pair swapped.
.ifeqs "\pre_op", "xvf" | |||
.ifeqs "\suf_op", "s" | |||
.ifeqs "\conj", "0" | |||
.ifeqs "\xconj", "0" | |||
xvpackod.w \tmp1, \in0, \tmp1 | |||
.else | |||
xvpackod.w \tmp1, \tmp1, \in0 | |||
.endif | |||
.else | |||
.ifeqs "\xconj", "0" | |||
xvpackod.w \tmp1, \in0, \in0 | |||
.else | |||
xvpackod.w \tmp1, \tmp1, \tmp1 | |||
.endif | |||
.endif | |||
xvshuf4i.w \tmp2, \in1, 0xb1 | |||
.else | |||
.ifeqs "\conj", "0" | |||
.ifeqs "\xconj", "0" | |||
xvpackod.d \tmp1, \in0, \tmp1 | |||
.else | |||
xvpackod.d \tmp1, \tmp1, \in0 | |||
.endif | |||
.else | |||
.ifeqs "\xconj", "0" | |||
xvpackod.d \tmp1, \in0, \in0 | |||
.else | |||
xvpackod.d \tmp1, \tmp1, \tmp1 | |||
.endif | |||
.endif | |||
xvshuf4i.d \tmp2, \in1, 0x0b | |||
.endif | |||
.else | |||
.ifeqs "\suf_op", "s" | |||
.ifeqs "\conj", "0" | |||
.ifeqs "\xconj", "0" | |||
vpackod.w \tmp1, \in0, \tmp1 | |||
.else | |||
vpackod.w \tmp1, \tmp1, \in0 | |||
.endif | |||
.else | |||
.ifeqs "\xconj", "0" | |||
vpackod.w \tmp1, \in0, \in0 | |||
.else | |||
vpackod.w \tmp1, \tmp1, \tmp1 | |||
.endif | |||
.endif | |||
vshuf4i.w \tmp2, \in1, 0xb1 | |||
.else | |||
.ifeqs "\conj", "0" | |||
.ifeqs "\xconj", "0" | |||
vpackod.d \tmp1, \in0, \tmp1 | |||
.else | |||
vpackod.d \tmp1, \tmp1, \in0 | |||
.endif | |||
.else | |||
.ifeqs "\xconj", "0" | |||
vpackod.d \tmp1, \in0, \in0 | |||
.else | |||
vpackod.d \tmp1, \tmp1, \tmp1 | |||
.endif | |||
.endif | |||
vshuf4i.d \tmp2, \in1, 0x0b | |||
.endif | |||
.endif | |||
// out += tmp1 * tmp2.
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out | |||
// Expand the next argument group, if any.
.ifnb \more | |||
GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more | |||
.endif | |||
.endm | |||
// | |||
// Media Related Macros | |||
// | |||
@@ -0,0 +1,343 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2024/02/20 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Register aliases for this kernel.
// Incoming arguments, following the CNAME prototype above.
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA_R $f0 | |||
#define ALPHA_I $f1 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
// inc_y is loaded from the stack in the prologue (presumably reusing the
// register that carried dummy1 -- confirm against the calling convention).
#define INC_Y $r6 | |||
// Loop counters and scratch general-purpose registers.
#define J $r12 | |||
#define I $r13 | |||
#define K $r14 | |||
#define Y_ORG $r15 | |||
#define OFFSET $r16 | |||
#define K_LDA $r17 | |||
#define M16 $r18 | |||
#define T0 $r19 | |||
// Column pointers into A.
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
// 256-bit vector registers. The macros below also name $vrN directly for
// 128-bit accesses, relying on $vrN being the low half of $xrN
// (e.g. $vr10 for Y0, $vr12 for A0).
#define VALPHA $xr1 | |||
#define X0 $xr2 | |||
#define X1 $xr3 | |||
#define X2 $xr4 | |||
#define X3 $xr5 | |||
#define X4 $xr6 | |||
#define X5 $xr7 | |||
#define X6 $xr8 | |||
#define X7 $xr9 | |||
#define Y0 $xr10 | |||
#define Y1 $xr11 | |||
#define A0 $xr12 | |||
#define A1 $xr13 | |||
#define A2 $xr14 | |||
#define A3 $xr15 | |||
#define A4 $xr16 | |||
#define A5 $xr17 | |||
#define A6 $xr18 | |||
#define A7 $xr19 | |||
#define A8 $xr20 | |||
#define A9 $xr21 | |||
#define A10 $xr22 | |||
#define A11 $xr23 | |||
#define A12 $xr24 | |||
#define A13 $xr25 | |||
#define A14 $xr26 | |||
#define A15 $xr27 | |||
// Scratch vectors handed to GCOMPLEXMUL / GCOMPLEXMADD as tmp0..tmp2.
#define TMP0 $xr28 | |||
#define TMP1 $xr29 | |||
#define TMP2 $xr30 | |||
// Translate the CONJ / XCONJ build flags into the 0/1 selectors that are
// passed to GCOMPLEXMUL and GCOMPLEXMADD. The two selectors are independent:
// GXCONJ mirrors XCONJ and GCONJ mirrors CONJ.
#if defined(XCONJ)
#define GXCONJ 1
#else
#define GXCONJ 0
#endif
#if defined(CONJ)
#define GCONJ 1
#else
#define GCONJ 0
#endif
// Unit-stride load of four x elements at byte offsets 0x00..0x30 into X0..X3,
// followed by GPERMI (xv, q, selector 0) on each register -- presumably
// broadcasting the low 128 bits to both halves -- and an in-place multiply of
// each by VALPHA.
// NOTE(review): if GLD xv expands to 32-byte xvld loads, the load for X3 reads
// 16 bytes past the fourth element -- confirm this is safe at the end of x.
.macro ZLOAD_X_4 | |||
GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 | |||
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// Strided variant of ZLOAD_X_4: loads four x elements spaced INC_X bytes apart
// into X0..X3 (T0 walks the addresses, X itself is left unchanged), applies
// xvpermi.q with selector 0 to each, then multiplies each by VALPHA in place.
// NOTE(review): each xvld is a 32-byte load for one 16-byte element.
.macro ZLOAD_X_4_GAP | |||
xvld X0, X, 0 | |||
xvpermi.q X0, X0, 0 | |||
PTR_ADD T0, X, INC_X | |||
xvld X1, T0, 0 | |||
xvpermi.q X1, X1, 0 | |||
PTR_ADD T0, T0, INC_X | |||
xvld X2, T0, 0 | |||
xvpermi.q X2, X2, 0 | |||
PTR_ADD T0, T0, INC_X | |||
xvld X3, T0, 0 | |||
xvpermi.q X3, X3, 0 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// Unit-stride load of y: Y0 from offset 0 and Y1 from offset 0x20
// (64 contiguous bytes, i.e. four 16-byte elements).
.macro ZLOAD_Y_4 | |||
GLD xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// Strided load of four y elements spaced INC_Y bytes apart, using 128-bit
// loads: elements 0 and 2 go to the low halves of Y0 and Y1 ($vr10, $vr11),
// elements 1 and 3 to the low halves of A1 and A2 ($vr13, $vr14). GPERMI then
// merges each pair into Y0 / Y1.
// A1 and A2 serve only as scratch here; T0 is clobbered.
.macro ZLOAD_Y_4_GAP | |||
vld $vr10, Y, 0 | |||
vldx $vr13, Y, INC_Y | |||
PTR_ALSL T0, INC_Y, Y, 1 | |||
vld $vr11, T0, 0 | |||
vldx $vr14, T0, INC_Y | |||
GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02 | |||
.endm | |||
// 4-row by 4-column update: loads two vectors from each of the four column
// pointers PA0..PA3 into A0..A7 (GLD_INC with step 0x20 -- presumably a
// post-increment of the pointer after every load), then accumulates
// Y0 += X0*A0 + X1*A2 + X2*A4 + X3*A6 and Y1 += X0*A1 + X1*A3 + X2*A5 + X3*A7
// through GCOMPLEXMADD with the GXCONJ/GCONJ selectors.
.macro ZGEMV_N_4x4 | |||
GLD_INC xv, , 0x20, \ | |||
A0, PA0, 0, A1, PA0, 0, \ | |||
A2, PA1, 0, A3, PA1, 0, \ | |||
A4, PA2, 0, A5, PA2, 0, \ | |||
A6, PA3, 0, A7, PA3, 0 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ | |||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 | |||
.endm | |||
// Unit-stride store of Y0 and Y1 back to y at offsets 0 and 0x20
// (the counterpart of ZLOAD_Y_4).
.macro ZSTORE_Y_4 | |||
GST xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
.endm | |||
// Strided store of four y elements spaced INC_Y bytes apart. Each element is
// written as two 64-bit lanes with xvstelm.d: lanes 0-1 and 2-3 of Y0, then
// lanes 0-1 and 2-3 of Y1. T0 walks the addresses; Y itself is left unchanged.
.macro ZSTORE_Y_4_GAP | |||
xvstelm.d Y0, Y, 0, 0 | |||
xvstelm.d Y0, Y, 0x08, 1 | |||
PTR_ADD T0, Y, INC_Y | |||
xvstelm.d Y0, T0, 0, 2 | |||
xvstelm.d Y0, T0, 0x08, 3 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 0 | |||
xvstelm.d Y1, T0, 0x08, 1 | |||
PTR_ADD T0, T0, INC_Y | |||
xvstelm.d Y1, T0, 0, 2 | |||
xvstelm.d Y1, T0, 0x08, 3 | |||
.endm | |||
// Load one 16-byte y element into $vr10 (the low half of Y0).
.macro ZLOAD_Y_1 | |||
vld $vr10, Y, 0 | |||
.endm | |||
// 1-row by 4-column update: 128-bit loads of one element from each column
// pointer into $vr12/$vr14/$vr16/$vr18 (the low halves of A0/A2/A4/A6) with a
// GLD_INC step of 0x10, then Y0 += X0*A0 + X1*A2 + X2*A4 + X3*A6.
// The arithmetic uses the 256-bit xvf form; only the low 128 bits of Y0 are
// stored afterwards by ZSTORE_Y_1.
.macro ZGEMV_N_1x4 | |||
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ | |||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2 | |||
.endm | |||
// Store one 16-byte y element from $vr10 (the low half of Y0).
.macro ZSTORE_Y_1 | |||
vst $vr10, Y, 0 | |||
.endm | |||
// Load one x element into X0, apply GPERMI (xv, q, selector 0) to it and
// multiply it by VALPHA in place.
// NOTE(review): GLD xv presumably issues a 32-byte load for a single 16-byte
// element -- confirm the over-read is safe for the last element of x.
.macro ZLOAD_X_1 | |||
GLD xv, , X0, X, 0x00 | |||
GPERMI xv, q, X0, X0, 0 | |||
GCOMPLEXMUL GXCONJ, \ | |||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||
.endm | |||
// 1-row by 1-column update: 128-bit load of one element of A from PA0 into
// $vr12 (the low half of A0) with a GLD_INC step of 0x10, then Y0 += X0 * A0.
.macro ZGEMV_N_1x1 | |||
GLD_INC v, , 0x10, $vr12, PA0, 0 | |||
GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||
.endm | |||
// Body of the kernel for one (inc_x, inc_y) stride combination.
//   XW   - infix that makes the local labels of this expansion unique
//   X_4  - suffix selecting the four-element x loader (ZLOAD_X_4 or ZLOAD_X_4_GAP)
//   X_1  - suffix selecting the one-element x loader
//   Y_4  - suffix selecting the four-element y load/store pair
//   Y_1  - suffix selecting the one-element y load/store pair
// Columns are processed four at a time (J = N >> 2), then one at a time
// (N & 3). Within a column group, rows are processed four at a time
// (I = M >> 2) and then one at a time (M & 3). Every path ends at .L_END.
.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req | |||
PTR_SRLI J, N, 2 | |||
beqz J, .L_\XW\()_N_3 | |||
// K_LDA = 4 * LDA - M16: added to the column pointers after a full pass over
// the rows to step them to the next group of four columns.
PTR_SLLI K_LDA, LDA, 2 | |||
PTR_SUB K_LDA, K_LDA, M16 | |||
.L_\XW\()_N_L4: | |||
ZLOAD_\X_4 | |||
// K is reset here and incremented in the row loops, but it is not read
// anywhere inside this macro.
xor K, K, K | |||
move Y, Y_ORG | |||
PTR_SRLI I, M, 2 | |||
beqz I, .L_\XW\()_M_3 | |||
.align 5 | |||
.L_\XW\()_M_L4: | |||
ZLOAD_\Y_4 | |||
ZGEMV_N_4x4 | |||
ZSTORE_\Y_4 | |||
PTR_ADDI I, I, -1 | |||
// Y += 4 * INC_Y.
PTR_ALSL Y, INC_Y, Y, 2 | |||
PTR_ADDI K, K, 4 | |||
bnez I, .L_\XW\()_M_L4 | |||
.L_\XW\()_M_3: | |||
// Remaining M & 3 rows of this column group.
andi I, M, 3 | |||
beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
ZLOAD_\Y_1 | |||
ZGEMV_N_1x4 | |||
ZSTORE_\Y_1 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD Y, Y, INC_Y | |||
PTR_ADDI K, K, 1 | |||
bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
PTR_ADDI J, J, -1 | |||
// Advance PA0..PA3 by K_LDA; the three preprocessor branches differ only in
// the add width suffix (d, w, d).
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#else | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#endif | |||
// X += 4 * INC_X.
PTR_ALSL X, INC_X, X, 2 | |||
bnez J, .L_\XW\()_N_L4 | |||
.L_\XW\()_N_3: | |||
// Remaining N & 3 columns, one at a time, using PA0 only.
andi J, N, 3 | |||
beqz J, .L_END | |||
.L_\XW\()_N_L1: | |||
ZLOAD_\X_1 | |||
xor K, K, K | |||
move Y, Y_ORG | |||
move I, M | |||
beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
ZLOAD_\Y_1 | |||
ZGEMV_N_1x1 | |||
ZSTORE_\Y_1 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD Y, Y, INC_Y | |||
PTR_ADDI K, K, 1 | |||
bnez I, .L_\XW\()_N_1_M_L1 | |||
.L_\XW\()_N_1_M_END: | |||
PTR_ADDI J, J, -1 | |||
// Single-column step: K_LDA = LDA - M16.
PTR_SUB K_LDA, LDA, M16 | |||
PTR_ADD PA0, PA0, K_LDA | |||
PTR_ADD X, X, INC_X | |||
bnez J, .L_\XW\()_N_L1 | |||
b .L_END | |||
.endm | |||
// Kernel entry point. Loads inc_y from the stack, converts the strides and M
// to byte counts, builds VALPHA, sets up the four column pointers and then
// dispatches through a 16-bit offset table to one of four ZGEMV_N_LASX
// expansions, selected by whether inc_x and inc_y equal 1.
PROLOGUE | |||
PTR_LD INC_Y, $sp, 0 | |||
push_if_used 17 + 7, 31 | |||
// I = (inc_x != 1), J = (inc_y != 1), then I = 2 * I + J is the table index.
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
PTR_SUB J, INC_Y, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ | |||
PTR_ALSL I, I, J, 1 | |||
// Scale LDA, INC_X, INC_Y by 16 (shift by 4) and set M16 = M * 16, so all four
// are byte counts for 16-byte elements from here on.
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
// Init VALPHA | |||
// Pack the low lanes of $xr0 (ALPHA_R) and $xr1 (ALPHA_I) together, then
// replicate the low 128 bits into VALPHA.
xvpackev.d $xr0, $xr1, $xr0 | |||
xvreplve0.q VALPHA, $xr0 | |||
move Y_ORG, Y | |||
move PA0, A | |||
// PA1..PA3 = the next three columns (each LDA bytes after the previous one).
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA | |||
#else | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA | |||
#endif | |||
// Computed jump: read the halfword at .L_GAP_TABLE + 2 * I and jump to
// .L_GAP_TABLE plus that offset.
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 // Obtain the offset address | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
// Offsets of the four variants relative to the table start.
.L_GAP_TABLE: | |||
.hword .L_GAP_0_0 - .L_GAP_TABLE | |||
.hword .L_GAP_0_1 - .L_GAP_TABLE | |||
.hword .L_GAP_1_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1_1 - .L_GAP_TABLE | |||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1 | |||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 | |||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 | |||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | |||
// Common exit: undo push_if_used and return through $r1.
.L_END: | |||
pop_if_used 17 + 7, 31 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE | |||
@@ -0,0 +1,299 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#include "loongarch64_asm.S" | |||
/********************************************************************* | |||
* 2024/02/20 guxiwei | |||
* UTEST : OK | |||
* CTEST : OK | |||
* TEST : OK | |||
* | |||
* | |||
*********************************************************************/ | |||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
*/ | |||
// Register aliases for this kernel.
// Incoming arguments, following the CNAME prototype above.
#define M $r4 | |||
#define N $r5 | |||
#define ALPHA_R $f0 | |||
#define ALPHA_I $f1 | |||
#define A $r7 | |||
#define LDA $r8 | |||
#define X $r9 | |||
#define INC_X $r10 | |||
#define Y $r11 | |||
// inc_y is loaded from the stack in the prologue.
#define INC_Y $r6 | |||
// Loop counters and scratch general-purpose registers.
// K and PY0 share $r14: K is used as a temporary in the prologue, PY0 as a
// y pointer inside ZGEMV_T_LASX.
#define J $r12 | |||
#define I $r13 | |||
#define K $r14 | |||
#define PY0 $r14 | |||
#define X_ORG $r15 | |||
#define PY1 $r16 | |||
#define K_LDA $r17 | |||
#define PY2 $r18 | |||
#define T0 $r19 | |||
// Column pointers into A.
#define PA0 $r20 | |||
#define PA1 $r23 | |||
#define PA2 $r24 | |||
#define PA3 $r25 | |||
#define PA4 $r26 | |||
#define PA5 $r27 | |||
#define PA6 $r28 | |||
#define PA7 $r29 | |||
#define M16 $r30 | |||
// 256-bit vector registers.
#define VALPHA $xr0 | |||
#define X0 $xr1 | |||
#define X1 $xr2 | |||
#define A0 $xr3 | |||
#define A1 $xr4 | |||
#define A2 $xr5 | |||
#define A3 $xr6 | |||
#define A4 $xr7 | |||
#define A5 $xr8 | |||
#define A6 $xr9 | |||
#define A7 $xr10 | |||
#define A8 $xr11 | |||
#define A9 $xr12 | |||
#define A10 $xr13 | |||
#define A11 $xr14 | |||
#define A12 $xr15 | |||
#define A13 $xr16 | |||
#define A14 $xr17 | |||
#define A15 $xr18 | |||
// Per-column accumulators.
#define TP0 $xr19 | |||
#define TP1 $xr20 | |||
#define TP2 $xr21 | |||
#define TP3 $xr22 | |||
#define TP4 $xr23 | |||
#define TP5 $xr24 | |||
#define TP6 $xr25 | |||
#define TP7 $xr26 | |||
// Scratch vectors handed to GCOMPLEXMADD as tmp0..tmp2.
#define TMP0 $xr27 | |||
#define TMP1 $xr28 | |||
#define TMP2 $xr29 | |||
// Y0..Y7 are second names for the registers behind A0..A7 ($xr3..$xr10).
#define Y0 $xr3 | |||
#define Y1 $xr4 | |||
#define Y2 $xr5 | |||
#define Y3 $xr6 | |||
#define Y4 $xr7 | |||
#define Y5 $xr8 | |||
#define Y6 $xr9 | |||
#define Y7 $xr10 | |||
// Selectors passed to GCOMPLEXMADD.
//   GXCONJ1 / GCONJ1 : used for the A * x accumulation steps
//   GXCONJ2 / GCONJ2 : used for the final alpha * acc + y steps
// GCONJ1 and GXCONJ2 are 0 in every configuration. GXCONJ1 is 1 exactly when
// one of CONJ / XCONJ is defined but not the other; GCONJ2 mirrors XCONJ.
#define GCONJ1 0
#define GXCONJ2 0
#if defined(CONJ) != defined(XCONJ)
#define GXCONJ1 1
#else
#define GXCONJ1 0
#endif
#if defined(XCONJ)
#define GCONJ2 1
#else
#define GCONJ2 0
#endif
// Clear the four accumulators TP0..TP3 (each register is XORed with itself).
.macro ZERO_Y4 | |||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 | |||
.endm | |||
// Clear the single accumulator TP0 (XOR with itself).
.macro ZERO_Y1 | |||
GXOR xv, v, TP0, TP0, TP0 | |||
.endm | |||
// Unit-stride load of x: X0 from offset 0x00 and X1 from offset 0x20
// (64 contiguous bytes, i.e. four 16-byte elements).
.macro ZLOAD_X4 | |||
GLD xv, , X0, X, 0x00, X1, X, 0x20 | |||
.endm | |||
// Strided load of four x elements spaced INC_X bytes apart: elements 0 and 1
// are merged into X0, elements 2 and 3 into X1 (xvpermi.q with selector 0x02).
// A0 is used as scratch and T0 walks the addresses; X itself is left unchanged.
// NOTE(review): each xvld is a 32-byte load for one 16-byte element.
.macro ZLOAD_X4_GAP | |||
xvld X0, X, 0 | |||
PTR_ADD T0, X, INC_X | |||
xvld A0, T0, 0 | |||
xvpermi.q X0, A0, 0x02 | |||
PTR_ADD T0, T0, INC_X | |||
xvld X1, T0, 0 | |||
PTR_ADD T0, T0, INC_X | |||
xvld A0, T0, 0 | |||
xvpermi.q X1, A0, 0x02 | |||
.endm | |||
// 4-row by 4-column step: loads two vectors from each of PA0..PA3 into A0..A7
// (GLD_INC with step 0x20 -- presumably a post-increment of the pointer after
// every load), then accumulates TPk += A(2k) * X0 + A(2k+1) * X1 for k = 0..3
// through GCOMPLEXMADD with the GXCONJ1/GCONJ1 selectors.
.macro ZGEMV_T_4x4 | |||
GLD_INC xv, , 0x20, \ | |||
A0, PA0, 0, A1, PA0, 0, \ | |||
A2, PA1, 0, A3, PA1, 0, \ | |||
A4, PA2, 0, A5, PA2, 0, \ | |||
A6, PA3, 0, A7, PA3, 0 | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ | |||
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ | |||
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 | |||
.endm | |||
// Body of the kernel for one inc_x case.
//   XW - infix that makes the local labels of this expansion unique
//   X4 - suffix selecting the four-element x loader (ZLOAD_X4 or ZLOAD_X4_GAP)
// Columns are processed four at a time (J = N >> 2), then one at a time
// (N & 3). For each column group, the rows are reduced into accumulators and
// the result is combined with VALPHA and y at the end. Every path ends at
// .L_END.
.macro ZGEMV_T_LASX XW:req, X4:req | |||
PTR_SRLI J, N, 2 | |||
beqz J, .L_\XW\()_N_3 | |||
// K_LDA = 4 * LDA - M16: added to the column pointers after a full pass over
// the rows to step them to the next group of four columns.
PTR_SLLI K_LDA, LDA, 2 | |||
PTR_SUB K_LDA, K_LDA, M16 | |||
.L_\XW\()_N_L4: | |||
ZERO_Y4 | |||
move X, X_ORG | |||
PTR_SRLI I, M, 2 | |||
beqz I, .L_\XW\()_M_3 | |||
.align 5 | |||
.L_\XW\()_M_L4: | |||
ZLOAD_\X4 | |||
ZGEMV_T_4x4 | |||
PTR_ADDI I, I, -1 | |||
// X += 4 * INC_X.
PTR_ALSL X, INC_X, X, 2 | |||
bnez I, .L_\XW\()_M_L4 | |||
.L_\XW\()_M_3: | |||
// Accumulated | |||
// GCOMPLEXACC (defined elsewhere) presumably folds each TPk into Yk; note that
// Y0..Y3 are the same registers as A0..A3, which the tail loop below keeps
// accumulating into.
GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 | |||
andi I, M, 3 | |||
beqz I, .L_\XW\()_M_END | |||
.align 5 | |||
.L_\XW\()_M_L1: | |||
// Remaining M & 3 rows: one x element and one element from each column.
// NOTE(review): GLD xv presumably issues 32-byte loads for 16-byte elements.
GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00 | |||
#if __loongarch_grlen == 64 | |||
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 | |||
#elif __loongarch_grlen == 32 | |||
GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 | |||
#else | |||
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 | |||
#endif | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ | |||
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
bnez I, .L_\XW\()_M_L1 | |||
.L_\XW\()_M_END: | |||
// Load the four y elements (spaced INC_Y bytes apart) into A8..A11 and compute
// y_k = VALPHA * acc_k + y_k. PY0 = Y + 2 * INC_Y.
// NOTE(review): xvld/xvldx read 32 bytes per element while only the low
// 16 bytes are written back by vst/vstx below -- confirm the over-read is safe
// at the end of y.
xvld A8, Y, 0x00 | |||
xvldx A9, Y, INC_Y | |||
PTR_ALSL PY0, INC_Y, Y, 1 | |||
xvld A10, PY0, 0x00 | |||
xvldx A11, PY0, INC_Y | |||
GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ | |||
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 | |||
PTR_ADDI J, J, -1 | |||
// Advance PA0..PA3 by K_LDA; the three preprocessor branches differ only in
// the add width suffix (d, w, d).
#if __loongarch_grlen == 64 | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#else | |||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
#endif | |||
// $vr11..$vr14 are the low halves of A8..A11.
vst $vr11, Y, 0x00 | |||
vstx $vr12, Y, INC_Y | |||
vst $vr13, PY0, 0x00 | |||
vstx $vr14, PY0, INC_Y | |||
// Y += 4 * INC_Y.
PTR_ALSL Y, INC_Y, Y, 2 | |||
bnez J, .L_\XW\()_N_L4 | |||
.L_\XW\()_N_3: | |||
// Remaining N & 3 columns, one at a time, using PA0 only.
andi J, N, 3 | |||
beqz J, .L_END | |||
PTR_SUB K_LDA, LDA, M16 | |||
.L_\XW\()_N_1: | |||
ZERO_Y1 | |||
move X, X_ORG | |||
move I, M | |||
beqz I, .L_END | |||
.align 5 | |||
.L_\XW\()_N_1_M_L1: | |||
// One row per iteration: TP0 += A0 * X0.
GLD xv, , A0, PA0, 0x00, X0, X, 0x00 | |||
GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||
PTR_ADDI I, I, -1 | |||
PTR_ADD X, X, INC_X | |||
PTR_ADDI PA0, PA0, 0x10 | |||
bnez I, .L_\XW\()_N_1_M_L1 | |||
.L_\XW\()_N_1_M_END: | |||
PTR_ADDI J, J, -1 | |||
// y = VALPHA * TP0 + y; only the low 128 bits ($vr3, the low half of A0) are
// stored back.
xvld A0, Y, 0x00 | |||
GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||
vst $vr3, Y, 0x00 | |||
PTR_ADD PA0, PA0, K_LDA | |||
PTR_ADD Y, Y, INC_Y | |||
bnez J, .L_\XW\()_N_1 | |||
b .L_END | |||
.endm | |||
// Kernel entry point. Loads inc_y from the stack, converts the strides and M
// to byte counts, builds VALPHA, sets up the column pointers and then
// dispatches through a 16-bit offset table to one of two ZGEMV_T_LASX
// expansions, selected by whether inc_x equals 1.
PROLOGUE | |||
PTR_LD INC_Y, $sp, 0 | |||
push_if_used 17 + 8, 30 | |||
// I = (inc_x != 1) is the table index.
PTR_ADDI K, $r0, 0x01 | |||
PTR_SUB I, INC_X, K | |||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
// Scale LDA, INC_X, INC_Y by 16 (shift by 4) and set M16 = M * 16, so all four
// are byte counts for 16-byte elements from here on.
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
// Init VALPHA | |||
// Pack the low lanes of $xr0 (ALPHA_R) and $xr1 (ALPHA_I) together, then
// replicate the low 128 bits into VALPHA (which is $xr0 itself).
xvpackev.d $xr0, $xr1, $xr0 | |||
xvreplve0.q VALPHA, $xr0 | |||
move X_ORG, X | |||
move PA0, A | |||
// PA1..PA4 = the following columns (each LDA bytes after the previous one).
// PA4 is set here but not referenced by ZGEMV_T_LASX.
#if __loongarch_grlen == 64 | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
#elif __loongarch_grlen == 32 | |||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
#else | |||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
#endif | |||
// Computed jump: read the halfword at .L_GAP_TABLE + 2 * I and jump to
// .L_GAP_TABLE plus that offset.
la.local T0, .L_GAP_TABLE | |||
PTR_ALSL I, I, T0, 1 | |||
ld.h K, I, 0 | |||
PTR_ADD T0, T0, K | |||
jirl $r0, T0, 0 | |||
// Offsets of the two variants relative to the table start.
.L_GAP_TABLE: | |||
.hword .L_GAP_0 - .L_GAP_TABLE | |||
.hword .L_GAP_1 - .L_GAP_TABLE | |||
.L_GAP_0: /* if (incx == 1) */ | |||
ZGEMV_T_LASX GAP_0, X4 | |||
.L_GAP_1: /* if (incx != 1) */ | |||
ZGEMV_T_LASX GAP_1, X4_GAP | |||
// Common exit: undo push_if_used and return through $r1.
.L_END: | |||
pop_if_used 17 + 8, 30 | |||
jirl $r0, $r1, 0x0 | |||
EPILOGUE |
@@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* Precision-dependent aliases for the RVV types and intrinsics used by this
 * kernel: the e32 / f32 forms when DOUBLE is not defined, the e64 / f64 forms
 * otherwise. The plain aliases use LMUL=8 (m8) vectors; the _M1 aliases use
 * LMUL=1 (m1) vectors, and VFREDSUMVS_FLOAT maps to the m8-to-m1 vfredusum
 * reduction. */
#if !defined(DOUBLE) | |||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
#define FLOAT_V_T vfloat32m8_t | |||
#define FLOAT_V_M1_T vfloat32m1_t | |||
#define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
#define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1 | |||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
#define FLOAT_V_T vfloat64m8_t | |||
#define FLOAT_V_M1_T vfloat64m1_t | |||
#define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
#define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1 | |||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
#endif | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
@@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
VSEV_FLOAT(y, vy, vl); | |||
} | |||
} else if (1 == inc_x) { | |||
} else if (1 == inc_x && 0 != inc_y) { | |||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
@@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
VSSEV_FLOAT(y, stride_y, vy, vl); | |||
} | |||
} else { | |||
} else if( 0 == inc_y ) { | |||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
size_t in_vl = VSETVL(n); | |||
vy = VFMVVF_FLOAT( y[0], in_vl ); | |||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
vl = VSETVL(n); | |||
vx = VLSEV_FLOAT(x, stride_x, vl); | |||
vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
} | |||
FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 ); | |||
vres = VFREDSUMVS_FLOAT( vy, vres, in_vl ); | |||
VSEV_FLOAT_M1(y, vres, 1); | |||
} else { | |||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
@@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _) | |||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
#ifdef RISCV_0p10_INTRINSICS | |||
#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
#else | |||
#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
#endif | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
{ | |||
@@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
VSEV_FLOAT(&y[j], vy0, gvl); | |||
j += gvl; | |||
} | |||
}else if(inc_x == 1){ | |||
} else if (1 == inc_x && 0 != inc_y) { | |||
stride_y = inc_y * sizeof(FLOAT); | |||
gvl = VSETVL(n); | |||
if(gvl <= n/2){ | |||
@@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
j += gvl; | |||
} | |||
} else if( 0 == inc_y ) { | |||
BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
size_t in_vl = VSETVL(n); | |||
vy0 = VFMVVF_FLOAT( y[0], in_vl ); | |||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
vl = VSETVL(n); | |||
vx0 = VLSEV_FLOAT(x, stride_x, vl); | |||
vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl); | |||
} | |||
FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 ); | |||
v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl ); | |||
y[0] = EXTRACT_FLOAT(v_res); | |||
}else{ | |||
stride_x = inc_x * sizeof(FLOAT); | |||
stride_y = inc_y * sizeof(FLOAT); | |||
@@ -101,8 +101,10 @@ SCLAUX = la_constants.o \ | |||
slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ | |||
slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ | |||
ssteqr.o ssterf.o slaisnan.o sisnan.o \ | |||
slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \ | |||
../INSTALL/second_$(TIMER).o | |||
slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o | |||
ifneq ($(F_COMPILER), IBM) | |||
SCLAUX += ../INSTALL/second_$(TIMER).o | |||
endif | |||
endif | |||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" | |||
@@ -124,7 +126,10 @@ DZLAUX = la_constants.o\ | |||
dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ | |||
dsteqr.o dsterf.o dlaisnan.o disnan.o \ | |||
dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \ | |||
../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o | |||
../INSTALL/dlamch.o | |||
ifneq ($(F_COMPILER), IBM) | |||
DZLAUX += ../INSTALL/dsecnd_$(TIMER).o | |||
endif | |||
endif | |||
#ifeq ($(BUILD_SINGLE),1) | |||
@@ -107,6 +107,12 @@ set(ZDMDEIGTST zchkdmd.f90) | |||
# add_eig_executable(<name> <source>...)
#
# Define the test executable <name> from the given sources and link it against
# the OpenBLAS library target.
#
# This stays a macro() on purpose: the CMAKE_Fortran_FLAGS edit below has to
# take effect in the caller's scope, which a function() would not do.
macro(add_eig_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
  # GNU Fortran combined with a Clang C compiler under USE_OPENMP: strip
  # -fopenmp from the Fortran flags and link omp and pthread explicitly
  # (presumably so the tests use the same OpenMP runtime as the Clang-built
  # library).
  # Both sides of each STREQUAL are quoted so the condition stays well-formed
  # when a compiler ID is empty and is not dereferenced a second time.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${name} omp pthread)
  endif()
endmacro()
@@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr | |||
# add_lin_executable(<name> <source>...)
#
# Define the test executable <name> from the given sources and link it against
# the OpenBLAS library target.
#
# This stays a macro() on purpose: the CMAKE_Fortran_FLAGS edit below has to
# take effect in the caller's scope, which a function() would not do.
macro(add_lin_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
  # GNU Fortran combined with a Clang C compiler under USE_OPENMP: strip
  # -fopenmp from the Fortran flags and link omp and pthread explicitly
  # (presumably so the tests use the same OpenMP runtime as the Clang-built
  # library).
  # Both sides of each STREQUAL are quoted so the condition stays well-formed
  # when a compiler ID is empty and is not dereferenced a second time.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${name} omp pthread)
  endif()
endmacro()
@@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||
#define SGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||
#define CGEMM_DEFAULT_UNROLL_M 1 | |||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 1 | |||
#else | |||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||
#define DGEMM_DEFAULT_UNROLL_M 16 | |||
#define SGEMM_DEFAULT_UNROLL_N 8 | |||
#define SGEMM_DEFAULT_UNROLL_M 16 | |||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||
#define CGEMM_DEFAULT_UNROLL_M 16 | |||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||
#define ZGEMM_DEFAULT_UNROLL_M 8 | |||
#endif | |||
#define QGEMM_DEFAULT_UNROLL_N 2 | |||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||
#define XGEMM_DEFAULT_UNROLL_N 1 | |||
#define QGEMM_DEFAULT_UNROLL_M 2 | |||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||
#define ZGEMM_DEFAULT_UNROLL_M 8 | |||
#define XGEMM_DEFAULT_UNROLL_M 1 | |||
#define SGEMM_DEFAULT_P 256 | |||
@@ -21,10 +21,14 @@ endif() | |||
if (BUILD_COMPLEX16) | |||
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) | |||
endif() | |||
message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) | |||
# Build one Fortran test executable per entry in OpenBLAS_Tests and link it
# against the OpenBLAS library.
foreach(test_bin ${OpenBLAS_Tests})
  add_executable(${test_bin} ${test_bin}.f)
  target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
  # GNU Fortran combined with a Clang C compiler under USE_OPENMP: strip
  # -fopenmp from the Fortran flags and link omp and pthread explicitly.
  # Both sides of each STREQUAL are quoted so the condition stays well-formed
  # when a compiler ID is empty and is not dereferenced a second time.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${test_bin} omp pthread)
  endif()
endforeach()
# $1 exec, $2 input, $3 output_result | |||